1 Introduction

1.1 Loading main setting

The code below prepares an analysis environment and set-up all required libraries and functions for further analysis.

options(warn = -1)

options(encoding = "UTF-8")

# Clear workspace
rm(list = ls())

# Set language to En
Sys.setlocale(category = "LC_ALL", locale = "english")
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
# Installing and loading libraries
libraries <- c("bib2df",
               "dplyr",
               "ggplot2",
               "gridExtra",
               "grid",
               "kableExtra",
               "readxl",
               "tidyr", 
               "treemapify",
               "scales",
               "cowplot",
               "colorspace")

if (length(setdiff(libraries, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(libraries, rownames(installed.packages())), dependencies = T)
}

sapply(libraries, function(libName) {
  library(libName, character.only = TRUE)
})
## $bib2df
## [1] "bib2df"    "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [7] "methods"   "base"     
## 
## $dplyr
## [1] "dplyr"     "bib2df"    "stats"     "graphics"  "grDevices" "utils"    
## [7] "datasets"  "methods"   "base"     
## 
## $ggplot2
##  [1] "ggplot2"   "dplyr"     "bib2df"    "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"     
## 
## $gridExtra
##  [1] "gridExtra" "ggplot2"   "dplyr"     "bib2df"    "stats"     "graphics" 
##  [7] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## $grid
##  [1] "grid"      "gridExtra" "ggplot2"   "dplyr"     "bib2df"    "stats"    
##  [7] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## $kableExtra
##  [1] "kableExtra" "grid"       "gridExtra"  "ggplot2"    "dplyr"     
##  [6] "bib2df"     "stats"      "graphics"   "grDevices"  "utils"     
## [11] "datasets"   "methods"    "base"      
## 
## $readxl
##  [1] "readxl"     "kableExtra" "grid"       "gridExtra"  "ggplot2"   
##  [6] "dplyr"      "bib2df"     "stats"      "graphics"   "grDevices" 
## [11] "utils"      "datasets"   "methods"    "base"      
## 
## $tidyr
##  [1] "tidyr"      "readxl"     "kableExtra" "grid"       "gridExtra" 
##  [6] "ggplot2"    "dplyr"      "bib2df"     "stats"      "graphics"  
## [11] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## $treemapify
##  [1] "treemapify" "tidyr"      "readxl"     "kableExtra" "grid"      
##  [6] "gridExtra"  "ggplot2"    "dplyr"      "bib2df"     "stats"     
## [11] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [16] "base"      
## 
## $scales
##  [1] "scales"     "treemapify" "tidyr"      "readxl"     "kableExtra"
##  [6] "grid"       "gridExtra"  "ggplot2"    "dplyr"      "bib2df"    
## [11] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [16] "methods"    "base"      
## 
## $cowplot
##  [1] "cowplot"    "scales"     "treemapify" "tidyr"      "readxl"    
##  [6] "kableExtra" "grid"       "gridExtra"  "ggplot2"    "dplyr"     
## [11] "bib2df"     "stats"      "graphics"   "grDevices"  "utils"     
## [16] "datasets"   "methods"    "base"      
## 
## $colorspace
##  [1] "colorspace" "cowplot"    "scales"     "treemapify" "tidyr"     
##  [6] "readxl"     "kableExtra" "grid"       "gridExtra"  "ggplot2"   
## [11] "dplyr"      "bib2df"     "stats"      "graphics"   "grDevices" 
## [16] "utils"      "datasets"   "methods"    "base"
# Information about session
sessionInfo()
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## system code page: 65001
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] colorspace_2.1-1 cowplot_1.1.3    scales_1.3.0     treemapify_2.5.6
##  [5] tidyr_1.3.1      readxl_1.4.3     kableExtra_1.4.0 gridExtra_2.3   
##  [9] ggplot2_3.5.1    dplyr_1.1.4      bib2df_1.1.2.0  
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.9         utf8_1.2.4         generics_0.1.3     xml2_1.3.6        
##  [5] stringi_1.8.4      digest_0.6.37      magrittr_2.0.3     evaluate_1.0.1    
##  [9] bookdown_0.41      fastmap_1.2.0      cellranger_1.1.0   jsonlite_1.8.9    
## [13] httr_1.4.7         purrr_1.0.2        fansi_1.0.6        viridisLite_0.4.2 
## [17] jquerylib_0.1.4    cli_3.6.3          rlang_1.1.4        munsell_0.5.1     
## [21] withr_3.0.2        cachem_1.1.0       yaml_2.3.10        tools_4.3.2       
## [25] vctrs_0.6.5        R6_2.5.1           lifecycle_1.0.4    stringr_1.5.1     
## [29] pkgconfig_2.0.3    pillar_1.9.0       bslib_0.8.0        gtable_0.3.6      
## [33] glue_1.8.0         Rcpp_1.0.13-1      ggfittext_0.10.2   systemfonts_1.1.0 
## [37] xfun_0.49          tibble_3.2.1       tidyselect_1.2.1   rstudioapi_0.17.1 
## [41] knitr_1.49         htmltools_0.5.8.1  rmarkdown_2.29     svglite_2.1.3     
## [45] humaniformat_0.6.0 compiler_4.3.2
# Function to clean and read BibTeX file
load_bibliography <- function(bib_file) {
  # Read and clean the BibTeX file
  lines <- base::readLines(bib_file, warn = FALSE)  # Suppress warnings
  lines <- base::gsub("[^[:print:]]", "", lines)  # Remove non-printable characters
  
  # Write cleaned lines to a temporary file
  temp_file <- base::tempfile(fileext = ".bib")
  base::writeLines(lines, temp_file)
  
  # Read the cleaned BibTeX file
  bib_df <- tryCatch(
    bib2df::bib2df(temp_file),
    error = function(e) {
      base::message("Error reading BibTeX file: ", e$message)
      return(NULL)
    }
  )
  
  if (is.null(bib_df))
    return(NULL)
  
  return(bib_df)
}
environment(load_bibliography) <- new.env(parent = baseenv())

create_bib_summary <- function(bib_df) {
  library(dplyr)
  
  # Explicitly import the pipe operator
  `%>%` <- dplyr::`%>%`
  
  # Extract required columns (ENTRYTYPE and YEAR)
  bib_summary <- bib_df %>%
    dplyr::mutate(
      entry_type = base::tolower(CATEGORY),
      # Get the entry type
      year = as.numeric(YEAR),
      # Ensure the year is numeric
      journal = JOURNAL
    ) %>%
    dplyr::filter(!is.na(year))               # Remove entries with missing year
  
  return(bib_summary)
}
environment(create_bib_summary) <- new.env(parent = baseenv())

plot_publication_distribution <- function(bib_summary) {
  # Import pipe manually
  `%>%` <- magrittr::`%>%`
  
  # Check for valid data
  if (is.null(bib_summary) || nrow(bib_summary) == 0) {
    base::message("No valid data to plot.")
    return(NULL)
  }
  
  # Build distribution and labels
  type_distribution <- bib_summary %>%
    dplyr::count(entry_type) %>%
    dplyr::mutate(
      prop       = n / base::sum(n) * 100,
      label      = base::paste0(entry_type, "\n", n, " (", base::round(prop, 1), "%)"),
      entry_type = base::as.character(entry_type)
    )
  
  # Create a consistent color palette
  global_palette <- stats::setNames(
    grDevices::colorRampPalette(ggsci::pal_jco("default")(3))(nrow(type_distribution)),
    type_distribution$entry_type
  )
  
  base_color <- global_palette[1]
  
  # Create the bar chart of total publications per year with a uniform blue fill
  trend_plot <- ggplot2::ggplot(bib_summary, ggplot2::aes(x = year)) +
    ggplot2::geom_bar(fill = base_color, color = "white") +
    ggplot2::labs(title = "", x     = "Publication Year", y     = "Number of Publications") +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      axis.text.x     = ggplot2::element_text(angle = 45, hjust = 1),
      plot.title      = ggplot2::element_text(
        hjust = 0.5,
        size = 14,
        face = "plain"
      )
    )
  
  
  # Pie chart with the same palette
  pie_chart <- ggplot2::ggplot(type_distribution,
                               ggplot2::aes(x = "", y = prop, fill = entry_type)) +
    ggplot2::geom_col(width = 1, color = "white") +
    ggplot2::coord_polar(theta = "y") +
    ggplot2::scale_fill_manual(values = global_palette, drop = TRUE, name = "Group") +
    ggplot2::theme_void() +
    ggrepel::geom_label_repel(
      ggplot2::aes(label = label),
      position     = ggplot2::position_stack(vjust = 0.5),
      size         = 4,
      show.legend  = FALSE,
      segment.color = "grey50"
    ) +
    ggplot2::theme(legend.position = "bottom")
  
  # Title grob Publication Trends and Type Proportions
  title_grob <- grid::textGrob(
    "",
    gp = grid::gpar(fontsize = 14, fontface = "plain")
  )
  
  # Combine plots
  combined_plot <- gridExtra::grid.arrange(
    title_grob,
    gridExtra::arrangeGrob(trend_plot, pie_chart, ncol = 2),
    ncol   = 1,
    heights = c(0.1, 1)
  )
  
  return(combined_plot)
}
# Isolate environment to avoid masking
environment(plot_publication_distribution) <- baseenv()

# Function to aggregate articles by category and journal
aggregate_by_journal <- function(bib_summary) {
  library(dplyr)
  
  # Explicitly import the pipe operator
  `%>%` <- dplyr::`%>%`
  
  journal_aggregation <- bib_summary %>%
    dplyr::group_by(entry_type, journal) %>%
    dplyr::summarise(article_count = dplyr::n(), .groups = 'drop') %>%
    dplyr::arrange(desc(article_count))
  
  return(journal_aggregation)
}
environment(aggregate_by_journal) <- new.env(parent = baseenv())

create_pie_chart <- function(summary_df,
                             title = "",
                             subtitle = "",
                             global_palette) {
  # Pie chart with the same palette
  pie_chart <- ggplot2::ggplot(summary_df, ggplot2::aes(x = "", y = percentage, fill = group)) +
    ggplot2::geom_col(width = 1, color = "white") +
    ggplot2::coord_polar(theta = "y") +
    ggplot2::scale_fill_manual(
      values = global_palette,
      drop = TRUE,
      guide = ggplot2::guide_legend(title = "Group")
    ) +
    ggplot2::labs(title = title, subtitle = subtitle) +
    ggplot2::theme_void() +
    ggrepel::geom_label_repel(
      ggplot2::aes(label = label),
      position     = ggplot2::position_stack(vjust = 0.5),
      size         = 4,
      show.legend  = FALSE,
      segment.color = "grey50"
    ) +
    ggplot2::theme(
      plot.title      = ggplot2::element_text(hjust = 0.5, size = 14, face = "plain"),
      plot.subtitle = ggplot2::element_text(hjust = 0.5),
      legend.position    = "right",
      legend.direction   = "vertical",
      legend.title       = ggplot2::element_text(face = "plain"),
      legend.text        = ggplot2::element_text(size = 10)
    )
  
  return(pie_chart)
}
environment(create_pie_chart) <- new.env(parent = baseenv())

create_treemap_chart <- function(summary_df,
                                 title = "",
                                 subtitle = "",
                                 global_palette,
                                 text_size = 11,
                                 text_color = "white",
                                 show_values = TRUE,
                                 value_var = "count",
                                 label_var = "label",
                                 group_var = "group",
                                 sort_by = "count",
                                 sort_desc = TRUE) {
  # Require necessary packages
  if (!requireNamespace("treemapify", quietly = TRUE)) {
    stop("Package 'treemapify' is needed for this function. Please install it.")
  }
  
  # Sort the data based on specified column
  if (sort_by %in% names(summary_df)) {
    summary_df <- summary_df[order(summary_df[[sort_by]], decreasing = sort_desc), ]
    
    # Convert group to factor with levels in the sorted order to maintain sorting
    summary_df[[group_var]] <- factor(summary_df[[group_var]], levels = unique(summary_df[[group_var]]))
  }
  
  # Create the treemap
  treemap_chart <- ggplot2::ggplot(summary_df,
                                   ggplot2::aes(
                                     area = .data[[value_var]],
                                     fill = .data[[group_var]],
                                     label = if (show_values)
                                       .data[[label_var]]
                                     else
                                       .data[[group_var]]
                                   )) +
    treemapify::geom_treemap() +
    treemapify::geom_treemap_text(
      colour = text_color,
      place = "centre",
      size = text_size,
      fontface = "bold"
    ) +
    ggplot2::scale_fill_manual(
      values = global_palette,
      drop = TRUE,
      guide = ggplot2::guide_legend(title = "Group")
    ) +
    ggplot2::labs(title = title, subtitle = subtitle) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      legend.position = "right",
      legend.direction = "vertical",
      legend.title = ggplot2::element_text(face = "plain"),
      legend.text = ggplot2::element_text(size = 10),
      plot.title = ggplot2::element_text(
        size = 14,
        face = "plain",
        hjust = 0.5
      ),
      plot.subtitle = ggplot2::element_text(size = 12, hjust = 0.5)
    )
  
  return(treemap_chart)
}
environment(create_treemap_chart) <- new.env(parent = baseenv())

create_chartexpo_radial <- function(data_df,
                                    category_var,
                                    value_var,
                                    label_var = NULL,
                                    title = "",
                                    subtitle = "",
                                    color_palette = c("#4472C4",
                                                      "#ED7D31",
                                                      "#A5A5A5",
                                                      "#FFC000",
                                                      "#5B9BD5",
                                                      "#70AD47"),
                                    max_value = NULL,
                                    show_labels = TRUE,
                                    show_legend = TRUE) {
  # Ensure the data is properly sorted for radial display
  data_df <- data_df[order(data_df[[category_var]]), ]
  
  # Set maximum value for consistent scaling
  if (is.null(max_value)) {
    max_value <- max(data_df[[value_var]]) * 1.1  # Add 10% buffer
  }
  
  # Calculate positions and prepare data
  n_categories <- nrow(data_df)
  data_df$id <- 1:n_categories
  data_df$angle <- 2 * pi * (data_df$id - 1) / n_categories
  
  # Assign colors
  if (length(color_palette) < n_categories) {
    color_palette <- rep_len(color_palette, n_categories)
  }
  data_df$color <- color_palette[1:n_categories]
  
  # Create the base plot
  radial_plot <- ggplot2::ggplot(data_df) +
    # Add background grid circles
    ggplot2::geom_hline(
      yintercept = seq(0, max_value, length.out = 5),
      color = "gray90",
      size = 0.5
    ) +
    # Add background grid lines
    ggplot2::geom_vline(
      xintercept = seq(0, 2 * pi, length.out = n_categories + 1)[1:n_categories],
      color = "gray90",
      size = 0.5
    ) +
    # Add the radial bars
    ggplot2::geom_col(
      ggplot2::aes(x = angle, y = .data[[value_var]], fill = .data[[category_var]]),
      width = 2 * pi / (n_categories * 1.5),
      alpha = 0.85
    ) +
    # Add points at the end of each bar
    ggplot2::geom_point(ggplot2::aes(x = angle, y = .data[[value_var]], color = .data[[category_var]]), size = 3) +
    # Add value labels outside the bars
    ggplot2::geom_text(
      ggplot2::aes(
        x = angle,
        y = .data[[value_var]] * 1.15,
        # Position outside the bar
        label = if (!is.null(label_var))
          .data[[label_var]]
        else
          .data[[value_var]],
      ),
      hjust = 0.5,
      vjust = 0.5,
      size = 4,
      # Increased font size
      fontface = "bold",
      color = "black"
    ) +
    # Add category labels if requested
    {
      if (show_labels)
        ggplot2::geom_text(
          ggplot2::aes(
            x = angle,
            y = max_value * 1.3,
            # Moved further out for more space
            label = .data[[category_var]]
          ),
          hjust = 0.5,
          vjust = 0.5,
          size = 4.5,
          # Increased font size
          fontface = "bold"
        )
    } +
    # Configure the polar coordinates
    ggplot2::coord_polar() +
    # Set custom colors
    ggplot2::scale_fill_manual(values = color_palette) +
    ggplot2::scale_color_manual(values = color_palette) +
    # Set limits - increased to accommodate the labels
    ggplot2::ylim(0, max_value * 1.4) +
    # Add titles
    ggplot2::labs(title = title, subtitle = subtitle) +
    # Theme customization to match ChartExpo style
    ggplot2::theme_minimal() +
    ggplot2::theme(
      axis.text = ggplot2::element_blank(),
      axis.title = ggplot2::element_blank(),
      axis.ticks = ggplot2::element_blank(),
      panel.grid = ggplot2::element_blank(),
      plot.title = ggplot2::element_text(
        size = 14,
        face = "plain",
        hjust = 0.5
      ),
      plot.subtitle = ggplot2::element_text(size = 12, hjust = 0.5),
      legend.position = if (show_legend)
        "bottom"
      else
        "none",
      legend.title = ggplot2::element_blank(),
      plot.margin = ggplot2::unit(c(1, 1, 1, 1), "cm")
    )
  
  return(radial_plot)
}
environment(create_chartexpo_radial) <- new.env(parent = baseenv())

create_bubble_plot <- function(summary_df,
                               title = "",
                               subtitle = "",
                               global_palette) {
  bubble_plot <- ggplot2::ggplot(summary_df,
                                 ggplot2::aes(
                                   x = YEAR,
                                   y = group,
                                   size = n,
                                   fill = group
                                 )) +
    ggplot2::geom_point(shape = 21,
                        color = "white",
                        alpha = 0.8) +
    # explicitly request a size legend, and give it point glyphs
    ggplot2::scale_size_area(
      max_size = 15,
      guide = ggplot2::guide_legend(
        title        = "Count",
        override.aes = base::list(
          shape = 21,
          fill  = "grey70",
          # or some neutral colour
          color = "white",
          # match your points
          alpha = 0.8
        )
      )
    ) +
    ggplot2::scale_fill_manual(values = global_palette,
                               guide = ggplot2::guide_legend(title = "Group")) +
    ggplot2::labs(x = "Year", y = NULL) +
    ggplot2::theme_minimal() +
    ggplot2::theme(
      axis.text.x      = ggplot2::element_text(angle = 45, hjust = 1),
      panel.grid.major.y = ggplot2::element_blank(),
      legend.position    = "right",
      legend.direction   = "vertical",
      legend.title       = ggplot2::element_text(
        face = "plain",
        hjust = 0.5
      ),
      legend.text        = ggplot2::element_text(size = 10)
    )
  
  return(bubble_plot)
  
}
environment(create_bubble_plot) <- new.env(parent = baseenv())

create_summary_df <- function(df, column_name) {
  library(dplyr)
  
  # Explicitly import the pipe operator
  `%>%` <- dplyr::`%>%`
  
  df %>%
    dplyr::count(!!dplyr::sym(column_name)) %>%
    dplyr::mutate(
      percentage = base::round(n / base::sum(n) * 100, 1),
      label = base::paste0(n, " (", percentage, "%)"),
      column = column_name  # Add column name for faceting
    )  %>% dplyr::rename_with( ~ "group", 1) %>% dplyr::rename_with( ~ "count", 2)
}
environment(create_summary_df) <- new.env(parent = baseenv())

create_bubble_df <- function(year_df, references_categories) {

  # Create a new dataframe for the bubble plot
  bubble_df <- base::data.frame(bibtexkey = base::character(),
                          group = base::character(),
                          stringsAsFactors = FALSE)
  
  # Loop through the references_categories list to extract bibtexkeys and their categories
  for (category in base::names(references_categories)) {
    keys <- references_categories[[category]]
    # Split in case there are multiple keys in one string (comma separated)
    keys <- base::unlist(base::strsplit(keys, ",\\s*"))
    
    # Create temporary dataframe for this category
    temp_df <- base::data.frame(
      bibtexkey = keys,
      group = category,
      stringsAsFactors = FALSE
    )
    
    # Append to the main dataframe
    bubble_df <- base::rbind(bubble_df, temp_df)
  }
  
  # Merge with original dataframe to get the YEAR
  bubble_df <- base::merge(bubble_df, year_df[, c("bibtexkey", "YEAR")], by = "bibtexkey", all.x = TRUE)
  
  has_any_na <- any(is.na(bubble_df$YEAR))
  
  if (has_any_na) {
    stop("Error: NA values found in the 'YEAR' column.")
  }
  
  # Count bibtexkeys by group and year
  summary_df <- stats::aggregate(bibtexkey ~ group + YEAR, data = bubble_df, FUN = length)
  base::names(summary_df)[3] <- "n"
  
  # If there are any NA values in YEAR, remove them
  summary_df <- summary_df[!base::is.na(summary_df$YEAR), ]
  
  # Sort summary_df by group and YEAR
  summary_df <- summary_df[base::order(summary_df$group, summary_df$YEAR), ]
  
  # Make sure YEAR is numeric
  summary_df$YEAR <-base::as.numeric(summary_df$YEAR)
  
  return(summary_df)
}
environment(create_bubble_df) <- new.env(parent = baseenv())

2 Checking a bibliography and generating basic statistics

Loading bibliographies.

bib_file <- "bibtex-information-fusion-document-classification.bib"
bib_df <- load_bibliography(bib_file)
head(bib_df)
## # A tibble: 6 x 39
##   CATEGORY BIBTEXKEY    ADDRESS ANNOTE AUTHOR BOOKTITLE CHAPTER CROSSREF EDITION
##   <chr>    <chr>        <chr>   <chr>  <list> <chr>     <chr>   <chr>    <chr>  
## 1 ARTICLE  wangq2022    <NA>    <NA>   <chr>  <NA>      <NA>    <NA>     <NA>   
## 2 ARTICLE  zhao2023     <NA>    <NA>   <chr>  <NA>      <NA>    <NA>     <NA>   
## 3 ARTICLE  cgoncalves2~ <NA>    <NA>   <chr>  <NA>      <NA>    <NA>     <NA>   
## 4 ARTICLE  reil2023     <NA>    <NA>   <chr>  <NA>      <NA>    <NA>     <NA>   
## 5 ARTICLE  debreuij2020 <NA>    <NA>   <chr>  <NA>      <NA>    <NA>     <NA>   
## 6 ARTICLE  liuw2021     <NA>    <NA>   <chr>  <NA>      <NA>    <NA>     <NA>   
## # i 30 more variables: EDITOR <list>, HOWPUBLISHED <chr>, INSTITUTION <chr>,
## #   JOURNAL <chr>, KEY <chr>, MONTH <chr>, NOTE <chr>, NUMBER <chr>,
## #   ORGANIZATION <chr>, PAGES <chr>, PUBLISHER <chr>, SCHOOL <chr>,
## #   SERIES <chr>, TITLE <chr>, TYPE <chr>, VOLUME <chr>, YEAR <dbl>,
## #   ISSN <chr>, ABSTRACT <chr>, DOI <chr>, KEYWORDS <chr>, URL <chr>,
## #   ISSUE <chr>, ISBN <chr>, COLLECTION <chr>, VENUE <chr>, PMID <chr>,
## #   ARCHIVEPREFIX <chr>, EPRINT <chr>, CITY <chr>
excel_file <- "2-articles-information-fusion-summarisation-cleaned.xlsx"
excel_df <- readxl::read_excel(excel_file, sheet = 1)
excel_df <- excel_df %>%
  filter(rowSums(!is.na(.)) > 0)
head(excel_df)
## # A tibble: 6 x 21
##     l.p doi        bibtexkey `Article title` `Short note` Please summarize or ~1
##   <dbl> <chr>      <chr>     <chr>           <chr>        <chr>                 
## 1     1 10.1016/j~ dacosta2~ "Providing a g~ "Multi-moda~ "The article \"Provid~
## 2     2 10.1007/9~ gallo2021 "Visual Word E~ "The study ~ "The paper \"Visual W~
## 3     3 10.1109/A~ gui2021   "Technology Fo~ "The study ~ "The paper \"Technolo~
## 4     4 10.18653/~ huc2021   "One-class Tex~ "The study ~ "The paper \"One-clas~
## 5     5 10.1145/3~ garg2021  "On-Device Doc~ "The study ~ "The paper \"On-Devic~
## 6     6 10.18653/~ ma2021    "On the (in)ef~ "The study ~ "The paper \"On the (~
## # i abbreviated name:
## #   1: `Please summarize or provide the most important details of the work. Use a maximum of five sentences.`
## # i 15 more variables: `What are the findings?` <chr>,
## #   `What are the challenges?` <chr>,
## #   `Identified the datasets used in the article` <chr>,
## #   `Disambiguated datasets names` <chr>,
## #   `What other models were selected for comparison?` <chr>, ...
table(unique(sort(excel_df$bibtexkey)) == unique(sort(bib_df$BIBTEXKEY)))
## 
## TRUE 
##  139
bib_summary <- create_bib_summary(bib_df)
plotPubDist <- plot_publication_distribution(bib_summary)

ggsave(
  "fig-year-and-publication-type-pie-bar.pdf",
  plot = plotPubDist,
  width = 14,
  height = 8
)
journal_aggregation <- aggregate_by_journal(bib_summary)
knitr::kable(journal_aggregation, caption = "Publication type") %>% kableExtra::kable_styling()
Table 2.1: Table 2.2: Publication type
entry_type journal article_count
inbook Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics) 12
article Neurocomputing 5
article Pattern Recognition 5
article Applied Sciences 4
article Expert Systems with Applications 4
article IEEE Access 3
article Information 3
inproceedings CEUR Workshop Proceedings 3
article IEEE Transactions on Pattern Analysis and Machine Intelligence 2
article Information Fusion 2
article Information Sciences 2
article International Journal on Document Analysis and Recognition (IJDAR) 2
article Journal of Machine Learning Research 2
article Multimedia Tools and Applications 2
inbook NA 2
inproceedings Findings of the Association for Computational Linguistics: EMNLP 2022 2
inproceedings Proceedings - International Conference on Pattern Recognition 2
article ACM Transactions on Asian and Low-Resource Language Information Processing 1
article ACM Transactions on Knowledge Discovery from Data 1
article Applied Intelligence 1
article Artificial Intelligence Review 1
article CEUR Workshop Proceedings 1
article Computaci<U+FFFD>n y Sistemas 1
article Computer Standards &amp; Interfaces 1
article Computers &amp; Geosciences 1
article Computers &amp; Mathematics with Applications 1
article Computers, Materials &amp; Continua 1
article Digital Communications and Networks 1
article Electronics 1
article Expert Systems 1
article Future Generation Computer Systems 1
article IAENG International Journal of Computer Science 1
article ICIC Express Letters 1
article IEEE MultiMedia 1
article IEEE Transactions on Engineering Management 1
article IEEE Transactions on Knowledge and Data Engineering 1
article IEEE Transactions on Neural Networks and Learning Systems 1
article International Journal of Computational Intelligence Systems 1
article International Journal of Information Technology 1
article International Journal of Intelligent Engineering and Systems 1
article International Journal on Artificial Intelligence Tools 1
article Journal of Ambient Intelligence and Humanized Computing 1
article Journal of Artificial Intelligence Research 1
article Journal of Biomedical Informatics 1
article Knowledge-Based Systems 1
article Machine Learning 1
article Multimedia Systems 1
article Neural Networks 1
article Pattern Recognition and Image Analysis 1
article Procedia Computer Science 1
article Proceedings of the AAAI Conference on Artificial Intelligence 1
article Procesamiento del Lenguaje Natural 1
article Signal, Image and Video Processing 1
article Soft Computing 1
article Swarm and Evolutionary Computation 1
inbook Communications in Computer and Information Science 1
inbook Frontiers in Artificial Intelligence and Applications 1
inbook Lecture Notes in Artificial Intelligence (Subseries of Lecture Notes in Computer Science) 1
inproceedings 2009 IEEE International Workshop on Multimedia Signal Processing, MMSP ’09 1
inproceedings 2010 International Conference on Web Information Systems and Mining 1
inproceedings 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL HLT 2016 - Proceedings of the Conference 1
inproceedings 2018 IEEE Congress on Evolutionary Computation, CEC 2018 - Proceedings 1
inproceedings 2022 10th International Conference on Affective Computing and Intelligent Interaction, ACII 2022 1
inproceedings 50th Annual Meeting of the Association for Computational Linguistics, ACL 2012 - Proceedings of the Conference 1
inproceedings 6th International Conference on Fuzzy Systems and Knowledge Discovery, FSKD 2009 1
inproceedings ACM International Conference Proceeding Series 1
inproceedings ACM Web Conference 2023 - Proceedings of the World Wide Web Conference, WWW 2023 1
inproceedings ACM-BCB 2016 - 7th ACM Conference on Bioinformatics, Computational Biology, and Health Informatics 1
inproceedings Advances in Neural Information Processing Systems 1
inproceedings Advances in Neural Information Processing Systems 22 - Proceedings of the 2009 Conference 1
inproceedings EACL 2021 - 16th Conference of the European Chapter of the Association for Computational Linguistics, Proceedings of the Conference 1
inproceedings ECNLP 2022 - 5th Workshop on e-Commerce and NLP, Proceedings of the Workshop 1
inproceedings EMNLP 2020 - 2020 Conference on Empirical Methods in Natural Language Processing, Proceedings of the Conference 1
inproceedings Findings of the Association for Computational Linguistics: EMNLP 2023 1
inproceedings Frontiers in Artificial Intelligence and Applications 1
inproceedings ICETC 2010 - 2010 2nd International Conference on Education Technology and Computer 1
inproceedings MML’10 - Proceedings of the 3rd ACM International Workshop on Machine Learning and Music, Co-located with ACM Multimedia 2010 1
inproceedings NAACL HLT 2019 - 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies - Proceedings of the Conference 1
inproceedings Proceedings - 2009 International Conference on Industrial and Information Systems, IIS 2009 1
inproceedings Proceedings - 2013 IEEE International Conference on Big Data, Big Data 2013 1
inproceedings Proceedings - 2015 IEEE 16th International Conference on Information Reuse and Integration, IRI 2015 1
inproceedings Proceedings - 2018 IEEE International Conference on Cognitive Computing, ICCC 2018 - Part of the 2018 IEEE World Congress on Services 1
inproceedings Proceedings - IEEE International Conference on Data Mining Workshops, ICDM Workshops 2008 1
inproceedings Proceedings - The 1st International Conference on Intelligent Networks and Intelligent Systems, ICINIS 2008 1
inproceedings Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume 1
inproceedings Proceedings of the 29th International Conference on Machine Learning, ICML 2012 1
inproceedings Proceedings of the 29th USENIX Security Symposium 1
inproceedings Proceedings of the 3rd ACM India Joint International Conference on Data Science & Management of Data (8th ACM IKDD CODS & 26th COMAD) 1
inproceedings Proceedings of the AAAI Conference on Artificial Intelligence 1
inproceedings Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining 1
inproceedings Proceedings of the ACM Symposium on Applied Computing 1
inproceedings Proceedings of the Annual Meeting of the Association for Computational Linguistics 1
inproceedings Proceedings of the IEEE International Conference on Computer Vision 1
inproceedings Proceedings of the International Conference on Document Analysis and Recognition, ICDAR 1
inproceedings Proceedings of the International Joint Conference on Neural Networks 1
inproceedings SIGIR 2010 Proceedings - 33rd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval 1
inproceedings SIGIR’12 - Proceedings of the International ACM SIGIR Conference on Research and Development in Information Retrieval 1
inproceedings SemEval 2022 - 16th International Workshop on Semantic Evaluation, Proceedings of the Workshop 1
inproceedings WCRML 2019 - Proceedings of the ACM Workshop on Crossmodal Learning and Application 1
# Define the vectors
# Multimodal
MULTIMOAL_BIBTEXKEY <- c(
  "ma2021",
  "xianfang2024",
  "reil2023",
  "fujinumay2023",
  "hessel2020",
  "bakkalis2023",
  "chenz2023",
  "zouh2023",
  "chenl2022",
  "gallo2021",
  "anget2018",
  "rajendran2016",
  "ghorbanali2024",
  "jiangs2024",
  "liub2024",
  "liut2024",
  "ronghaop2024",
  "tengfeil2024",
  "yushili2024",
  "zhangy2024",
  "arlqaraleshs2024",
  "guod2023",
  "jarquin2023",
  "jarrahia2023",
  "kenny2023",
  "liangz2023",
  "linckere2023",
  "luzdearau2023",
  "ortizperez2023",
  "rasheeda2023",
  "shah2023",
  "dacosta2022",
  "dongpin2022",
  "kanchid2022",
  "paraskevopoulos2022",
  "sapeao2022",
  "wangq2022",
  "garg2021",
  "guelorget2021",
  "setiawan2021",
  "wang2021",
  "zingaro2021",
  "braz2020",
  "debreuij2020",
  "zhu2020",
  "jainr2019",
  "ravikiranm2019",
  "matricm2018",
  "tellez2018",
  "schmittm2017",
  "cristanim2014",
  "liparas2014",
  "perezgraciat2010",
  "zhangx2010",
  "chens2009",
  "chos2023",
  "adwaithd2022",
  "wajdm2024",
  "kozienkop2023",
  "carmona2020",
  "argon2018",
  "guptad2018",
  "guq2022",
  "chatziagapia2022",
  "yuet2022",
  "akhiamov2018",
  "akhtiamovo2017",
  "andriyanovn2022",
  "jiangs2024",
  "anget2018"
)

# Multiview
MULTIVIEW_BIBTEXKEY <- c(
  "brefeldu2015",
  "aminim2009",
  "liy2013",
  "zhangqi2024",
  "doinychko2020",
  "samya2023",
  "sangy2022",
  "jiax2021",
  "liang2021",
  "sus2021",
  "yangp2014",
  "maf2020",
  "max2020",
  "wangh2020",
  "bhatt2019",
  "chens2019",
  "hoylea2019",
  "wangh2019",
  "ferreira2018",
  "zhup2018",
  "zhanz2017",
  "xux2016",
  "perinaa2013",
  "zhangb2013",
  "zhangd2013",
  "guyo2012",
  "kovesim2012",
  "yangp2012",
  "zhengw2011",
  "chenb2009",
  "zhangx2009",
  "zhangb2008",
  "fengz2024",
  "jiz2024",
  "xuy2024",
  "varmanp2023",
  "zhao2023",
  "cgoncalves2022",
  "liuj2022",
  "luox2022",
  "gui2021",
  "huc2021",
  "liuw2021",
  "mmironczuk2020",
  "mmironczuk2019",
  "pengj2018",
  "huz2017",
  "sinorar2016",
  "xuh2016",
  "fakri2015",
  "liuj2014",
  "longg2013",
  "lig2012",
  "aminim2010",
  "aminim2010b",
  "suns2010",
  "zhangx2010b",
  "suns2008",
  "matsubara2005",
  "dasigiv2001",
  "graffm2023",
  "tianl2023",
  "karisanip2022",
  "lij2020",
  "zhang2021",
  "carmona2020",
  "rajendran2016",
  "liaox2015",
  "gup2009",
  "akhtiamov2019",
  "yangp2014",
  "hey2019",
  "xuc2017",
  "iglesias2016"
)
# Bibtex
ALL_BIBTEXKEY <- c(
  "dacosta2022",
  "gallo2021",
  "gui2021",
  "huc2021",
  "garg2021",
  "ma2021",
  "zingaro2021",
  "zhang2021",
  "wang2021",
  "liang2021",
  "guelorget2021",
  "sus2021",
  "zhao2023",
  "lij2020",
  "jiax2021",
  "maf2020",
  "max2020",
  "setiawan2021",
  "braz2020",
  "debreuij2020",
  "hessel2020",
  "doinychko2020",
  "carmona2020",
  "zhu2020",
  "bhatt2019",
  "hey2019",
  "wangh2020",
  "jainr2019",
  "ravikiranm2019",
  "mmironczuk2019",
  "chens2019",
  "wangh2019",
  "hoylea2019",
  "akhtiamov2019",
  "anget2018",
  "mmironczuk2020",
  "zhup2018",
  "matricm2018",
  "tellez2018",
  "akhiamov2018",
  "guptad2018",
  "ferreira2018",
  "argon2018",
  "pengj2018",
  "xuc2017",
  "schmittm2017",
  "zhanz2017",
  "akhtiamovo2017",
  "huz2017",
  "xuh2016",
  "sinorar2016",
  "xux2016",
  "rajendran2016",
  "iglesias2016",
  "fakri2015",
  "brefeldu2015",
  "liaox2015",
  "liparas2014",
  "liuj2014",
  "cristanim2014",
  "zhangd2013",
  "longg2013",
  "perinaa2013",
  "liy2013",
  "zhangb2013",
  "lig2012",
  "yangp2012",
  "kovesim2012",
  "guyo2012",
  "zhengw2011",
  "zhangx2010",
  "perezgraciat2010",
  "aminim2010b",
  "suns2010",
  "zhangx2010b",
  "aminim2010",
  "chens2009",
  "aminim2009",
  "chenb2009",
  "zhangx2009",
  "gup2009",
  "suns2008",
  "zhangb2008",
  "matsubara2005",
  "dasigiv2001",
  "ronghaop2024",
  "tengfeil2024",
  "liub2024",
  "zhangy2024",
  "xianfang2024",
  "zhangqi2024",
  "yushili2024",
  "ghorbanali2024",
  "jiz2024",
  "xuy2024",
  "fengz2024",
  "wajdm2024",
  "bakkalis2023",
  "zouh2023",
  "chenz2023",
  "luzdearau2023",
  "jarquin2023",
  "linckere2023",
  "kenny2023",
  "tianl2023",
  "varmanp2023",
  "graffm2023",
  "kozienkop2023",
  "liangz2023",
  "jarrahia2023",
  "samya2023",
  "liut2024",
  "rasheeda2023",
  "shah2023",
  "fujinumay2023",
  "chos2023",
  "ortizperez2023",
  "chenl2022",
  "dongpin2022",
  "guq2022",
  "yuet2022",
  "karisanip2022",
  "reil2023",
  "arlqaraleshs2024",
  "sapeao2022",
  "paraskevopoulos2022",
  "sangy2022",
  "liuj2022",
  "adwaithd2022",
  "kanchid2022",
  "luox2022",
  "jiangs2024",
  "wangq2022",
  "andriyanovn2022",
  "chatziagapia2022",
  "cgoncalves2022",
  "guod2023",
  "liuw2021",
  "yangp2014"
)
# Print basic information
unique(c(MULTIMOAL_BIBTEXKEY, MULTIVIEW_BIBTEXKEY))
##   [1] "ma2021"              "xianfang2024"        "reil2023"           
##   [4] "fujinumay2023"       "hessel2020"          "bakkalis2023"       
##   [7] "chenz2023"           "zouh2023"            "chenl2022"          
##  [10] "gallo2021"           "anget2018"           "rajendran2016"      
##  [13] "ghorbanali2024"      "jiangs2024"          "liub2024"           
##  [16] "liut2024"            "ronghaop2024"        "tengfeil2024"       
##  [19] "yushili2024"         "zhangy2024"          "arlqaraleshs2024"   
##  [22] "guod2023"            "jarquin2023"         "jarrahia2023"       
##  [25] "kenny2023"           "liangz2023"          "linckere2023"       
##  [28] "luzdearau2023"       "ortizperez2023"      "rasheeda2023"       
##  [31] "shah2023"            "dacosta2022"         "dongpin2022"        
##  [34] "kanchid2022"         "paraskevopoulos2022" "sapeao2022"         
##  [37] "wangq2022"           "garg2021"            "guelorget2021"      
##  [40] "setiawan2021"        "wang2021"            "zingaro2021"        
##  [43] "braz2020"            "debreuij2020"        "zhu2020"            
##  [46] "jainr2019"           "ravikiranm2019"      "matricm2018"        
##  [49] "tellez2018"          "schmittm2017"        "cristanim2014"      
##  [52] "liparas2014"         "perezgraciat2010"    "zhangx2010"         
##  [55] "chens2009"           "chos2023"            "adwaithd2022"       
##  [58] "wajdm2024"           "kozienkop2023"       "carmona2020"        
##  [61] "argon2018"           "guptad2018"          "guq2022"            
##  [64] "chatziagapia2022"    "yuet2022"            "akhiamov2018"       
##  [67] "akhtiamovo2017"      "andriyanovn2022"     "brefeldu2015"       
##  [70] "aminim2009"          "liy2013"             "zhangqi2024"        
##  [73] "doinychko2020"       "samya2023"           "sangy2022"          
##  [76] "jiax2021"            "liang2021"           "sus2021"            
##  [79] "yangp2014"           "maf2020"             "max2020"            
##  [82] "wangh2020"           "bhatt2019"           "chens2019"          
##  [85] "hoylea2019"          "wangh2019"           "ferreira2018"       
##  [88] "zhup2018"            "zhanz2017"           "xux2016"            
##  [91] "perinaa2013"         "zhangb2013"          "zhangd2013"         
##  [94] "guyo2012"            "kovesim2012"         "yangp2012"          
##  [97] "zhengw2011"          "chenb2009"           "zhangx2009"         
## [100] "zhangb2008"          "fengz2024"           "jiz2024"            
## [103] "xuy2024"             "varmanp2023"         "zhao2023"           
## [106] "cgoncalves2022"      "liuj2022"            "luox2022"           
## [109] "gui2021"             "huc2021"             "liuw2021"           
## [112] "mmironczuk2020"      "mmironczuk2019"      "pengj2018"          
## [115] "huz2017"             "sinorar2016"         "xuh2016"            
## [118] "fakri2015"           "liuj2014"            "longg2013"          
## [121] "lig2012"             "aminim2010"          "aminim2010b"        
## [124] "suns2010"            "zhangx2010b"         "suns2008"           
## [127] "matsubara2005"       "dasigiv2001"         "graffm2023"         
## [130] "tianl2023"           "karisanip2022"       "lij2020"            
## [133] "zhang2021"           "liaox2015"           "gup2009"            
## [136] "akhtiamov2019"       "hey2019"             "xuc2017"            
## [139] "iglesias2016"
length(unique(c(
  MULTIMOAL_BIBTEXKEY, MULTIVIEW_BIBTEXKEY
))) == length(ALL_BIBTEXKEY)
## [1] TRUE
setdiff(unique(c(
  MULTIMOAL_BIBTEXKEY, MULTIVIEW_BIBTEXKEY
)), ALL_BIBTEXKEY)
## character(0)
setdiff(ALL_BIBTEXKEY, unique(c(
  MULTIMOAL_BIBTEXKEY, MULTIVIEW_BIBTEXKEY
)))
## character(0)
names(table(unique(sort(excel_df$bibtexkey)))) == unique(sort(ALL_BIBTEXKEY))
##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [61] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [76] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [91] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [106] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [136] TRUE TRUE TRUE TRUE
GG_PLOTS <- c()
# Count papers in each group
multimodal_count <- length(unique(MULTIMOAL_BIBTEXKEY))
multiview_count <- length(unique(MULTIVIEW_BIBTEXKEY))

# Count papers in both categories (overlap)
papers_in_both <- length(intersect(MULTIMOAL_BIBTEXKEY, MULTIVIEW_BIBTEXKEY))

# Count papers exclusive to each group
multimodal_only <- multimodal_count - papers_in_both
multiview_only <- multiview_count - papers_in_both


# Define the short categories
categories <- c("multimodal", "multiview", "both")

# Create a data frame for the pie chart
paper_counts <- data.frame(
  group = c("multimodal", "multiview", "both"),
  count = c(multimodal_only, multiview_only, papers_in_both)
)

# Calculate percentages
paper_counts <- paper_counts %>%
  mutate(
    percentage = round(count / sum(count) * 100, 1),
    label = paste0(count, " (", percentage, "%)"),
    group   = factor(group, levels = categories)
  )

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(3))(3),
                                  categories)

# Create pie chart with ggplot2
ggplot_chart <- create_pie_chart(
  paper_counts,
  title = "",
  subtitle = "",
  global_palette = global_palette
)
GG_PLOTS <- append(GG_PLOTS, ggplot_chart)

ggsave(
  "fig-mm-mv-both-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)
ggplot_chart

knitr::kable(paper_counts, caption = "Paper Distribution by Category") %>% kableExtra::kable_styling()
Table 2.3: Table 2.4: Paper Distribution by Category
group count percentage label
multimodal 66 47.5 66 (47.5%)
multiview 71 51.1 71 (51.1%)
both 2 1.4 2 (1.4%)
year_group_counts <- excel_df %>%
  mutate(bibtexkey = tolower(bibtexkey)) %>%
  inner_join(
    bib_df %>% mutate(BIBTEXKEY = tolower(BIBTEXKEY)) %>% select('BIBTEXKEY', 'YEAR'),
    by = c("bibtexkey" = "BIBTEXKEY")
  ) %>%
  mutate(
    in_mm = bibtexkey %in% tolower(MULTIMOAL_BIBTEXKEY),
    in_mv = bibtexkey %in% tolower(MULTIVIEW_BIBTEXKEY),
    group = case_when(
      in_mm & !in_mv ~ "multimodal",
      in_mv & !in_mm ~ "multiview",
      in_mm & in_mv  ~ "both",
      TRUE           ~ NA_character_
    )
  ) %>%
  filter(!is.na(group)) %>%
  count(YEAR, group, name = "n") %>%
  mutate(group = factor(group, levels = categories))

bubble_plot <- create_bubble_plot(
  year_group_counts,
  title = "",
  subtitle = "",
  global_palette = global_palette
)

GG_PLOTS <- append(GG_PLOTS, bubble_plot)
paste("Multimodal pappers: ", sum(year_group_counts$n[year_group_counts$group == "multimodal"]))
## [1] "Multimodal pappers:  66"
paste("Multiview pappers: ", sum(year_group_counts$n[year_group_counts$group == "multiview"]))
## [1] "Multiview pappers:  71"
overlap_keys <- intersect(MULTIMOAL_BIBTEXKEY, MULTIVIEW_BIBTEXKEY)
print(overlap_keys)
## [1] "rajendran2016" "carmona2020"
bubble_plot

# 1. Wrap the title at ~60 characters per line
# Pie chart of the distribution of works on multimodal and multi-view learning (left chart) and the distribution of the number of these works by year (right figure)
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

# 2. Create a centered title grob
title_grob <- grid::textGrob(
  wrapped_title,
  gp = grid::gpar(fontsize = 12, fontface = "plain"),
  just = "center"  # center align
)

# 3. Draw: give a bit more room to the title (e.g. 1:10 height ratio)
combined_plot <- gridExtra::grid.arrange(
  title_grob,
  gridExtra::arrangeGrob(ggplot_chart, bubble_plot, ncol = 2),
  ncol   = 1,
  heights = c(1, 10)
)

ggsave(
  "fig-mm-mv-both-pie-bubble.pdf",
  plot = combined_plot,
  width = 14,
  height = 8
)

3 Multimodal reserch papper analysis

multimodal_df <- excel_df %>%
  filter(bibtexkey %in% MULTIMOAL_BIBTEXKEY)

head(multimodal_df)
## # A tibble: 6 x 21
##     l.p doi        bibtexkey `Article title` `Short note` Please summarize or ~1
##   <dbl> <chr>      <chr>     <chr>           <chr>        <chr>                 
## 1     1 10.1016/j~ dacosta2~ "Providing a g~ Multi-modal~ "The article \"Provid~
## 2     2 10.1007/9~ gallo2021 "Visual Word E~ The study f~ "The paper \"Visual W~
## 3     5 10.1145/3~ garg2021  "On-Device Doc~ The study u~ "The paper \"On-Devic~
## 4     6 10.18653/~ ma2021    "On the (in)ef~ The study i~ "The paper \"On the (~
## 5     7 10.1109/I~ zingaro2~ "Multimodal si~ The study f~ "The paper \"Multimod~
## 6     9 10.1109/M~ wang2021  "Implicit Emot~ The study f~ "The article titled \~
## # i abbreviated name:
## #   1: `Please summarize or provide the most important details of the work. Use a maximum of five sentences.`
## # i 15 more variables: `What are the findings?` <chr>,
## #   `What are the challenges?` <chr>,
## #   `Identified the datasets used in the article` <chr>,
## #   `Disambiguated datasets names` <chr>,
## #   `What other models were selected for comparison?` <chr>, ...
paste("Extracted", nrow(multimodal_df), "multimodal papers out of", nrow(excel_df), "total papers")
## [1] "Extracted 68 multimodal papers out of 139 total papers"

3.1 Topics

year_multimodal_df <- multimodal_df %>%
  mutate(bibtexkey = tolower(bibtexkey)) %>%
  inner_join(
    bib_df %>% mutate(BIBTEXKEY = tolower(BIBTEXKEY)) %>% select('BIBTEXKEY', 'YEAR'),
    by = c("bibtexkey" = "BIBTEXKEY")
  )

3.1.1 General Framework

general_framework <- list()
general_framework[['When to use multi-modal learning']] <-  c('ma2021')
general_framework[['Feature selection as a multi-modal optimization problem']] <-   c('xianfang2024')
general_framework[['Handling missing views or modalities']] <-  c('reil2023')
general_framework[['Evaluating multi-modal model performance']] <-  c('fujinumay2023, hessel2020') 
general_framework[['Multimodal representation learning']] <-    c('chenz2023, zouh2023, gallo2021, rajendran2016')

summary_df <- create_bubble_df(year_multimodal_df, general_framework)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(5))(5),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "General Framework",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "General Framework") %>% kableExtra::kable_styling()
Table 3.1: Table 3.2: General Framework
group YEAR n
2 Evaluating multi-modal model performance 2020 1
6 Evaluating multi-modal model performance 2023 1
8 Feature selection as a multi-modal optimization problem 2024 1
5 Handling missing views or modalities 2022 1
1 Multimodal representation learning 2016 1
3 Multimodal representation learning 2021 1
7 Multimodal representation learning 2023 2
4 When to use multi-modal learning 2021 1
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "General Framework") %>% kableExtra::kable_styling()
Table 3.3: Table 3.4: General Framework
group count percentage label column
Evaluating multi-modal model performance 2 22.2 2 (22.2%) group
Feature selection as a multi-modal optimization problem 1 11.1 1 (11.1%) group
Handling missing views or modalities 1 11.1 1 (11.1%) group
Multimodal representation learning 4 44.4 4 (44.4%) group
When to use multi-modal learning 1 11.1 1 (11.1%) group
# 1. Wrap the title at ~60 characters per line
# Treemap of the Distribution of Works on Multimodal–General Framework (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mm-gen-frem-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

3.1.2 Task

tasks <- list()
tasks[['Finance']] <- c('chos2023')
tasks[['Healthcare']] <- c('chenz2023, ortizperez2023, braz2020')
tasks[['Disaster response/managment']] <- c('arlqaraleshs2024, adwaithd2022, dacosta2022, debreuij2020')
tasks[['Air traffic control']] <- c('guod2023')
tasks[['Cultural heritage']] <- c('reil2023, perezgraciat2010')
tasks[['Education']] <- c('linckere2023, sapeao2022')
tasks[['E-commerce']] <- c('wajdm2024, chenl2022')
tasks[['Industrial fault diagnosis']] <- c('dongpin2022')
tasks[['Social Network']] <- c('shah2023')
tasks[['Social media analysis']] <- c('yushili2024, jarquin2023, kozienkop2023, guelorget2021, carmona2020, argon2018, guptad2018')
tasks[['Fake news detection']] <- c('jarrahia2023, liangz2023')
tasks[['Hate speech and offensive language detection']] <- c('kozienkop2023, guq2022')
tasks[['Emotion recognition']] <- c('ronghaop2024, kenny2023, wang2021, schmittm2017')
tasks[['Author profiling']] <- c('carmona2020, argon2018, matricm2018, tellez2018, cristanim2014')
tasks[['Document image classification']] <- c('jiangs2024, bakkalis2023, chos2023, fujinumay2023, luzdearau2023, rasheeda2023, kanchid2022, garg2021, zingaro2021, jainr2019')
#tasks[['Video classification']] <- c('')
tasks[['Speaker role identification']] <- c('guod2023')
tasks[['Sentiment analysis']] <- c('ghorbanali2024, liub2024, zhangy2024, setiawan2021, anget2018')
tasks[['Filled Pause Detection']] <- c('chatziagapia2022')
tasks[['Safety-Report Observations']] <- c('paraskevopoulos2022')
tasks[['Malware Text Classification']] <- c('ravikiranm2019')
tasks[['Research paper classification']] <- c('liut2024, yuet2022')
tasks[['Web document classification']] <- c('ma2021, liparas2014, zhangx2010')

summary_df <- create_bubble_df(year_multimodal_df, tasks)

base <- ggsci::pal_jco("default")(10)
extra <- c(lighten(base, 0.3), darken(base, 0.2))
palette_23 <- c(base, extra)[1:23]
global_palette <- stats::setNames(palette_23, unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Tasks",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Tasks") %>% kableExtra::kable_styling()
Table 3.5: Table 3.6: Tasks
group YEAR n
31 Air traffic control 2023 1
3 Author profiling 2014 1
6 Author profiling 2018 3
11 Author profiling 2020 1
1 Cultural heritage 2010 1
20 Cultural heritage 2022 1
12 Disaster response/managment 2020 1
21 Disaster response/managment 2022 2
32 Disaster response/managment 2023 1
9 Document image classification 2019 1
15 Document image classification 2021 2
22 Document image classification 2022 3
33 Document image classification 2023 3
44 Document image classification 2024 1
23 E-commerce 2022 1
34 E-commerce 2023 1
24 Education 2022 1
35 Education 2023 1
5 Emotion recognition 2017 1
16 Emotion recognition 2021 1
36 Emotion recognition 2023 1
45 Emotion recognition 2024 1
25 Fake news detection 2022 1
37 Fake news detection 2023 1
26 Filled Pause Detection 2022 1
38 Finance 2023 1
27 Hate speech and offensive language detection 2022 1
39 Hate speech and offensive language detection 2023 1
13 Healthcare 2020 1
40 Healthcare 2023 2
28 Industrial fault diagnosis 2022 1
10 Malware Text Classification 2019 1
29 Research paper classification 2022 1
46 Research paper classification 2024 1
30 Safety-Report Observations 2022 1
7 Sentiment analysis 2018 1
17 Sentiment analysis 2021 1
47 Sentiment analysis 2024 3
8 Social media analysis 2018 2
14 Social media analysis 2020 1
18 Social media analysis 2021 1
41 Social media analysis 2023 2
48 Social media analysis 2024 1
42 Social Network 2023 1
43 Speaker role identification 2023 1
2 Web document classification 2010 1
4 Web document classification 2014 1
19 Web document classification 2021 1
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Tasks") %>% kableExtra::kable_styling()
Table 3.7: Table 3.8: Tasks
group count percentage label column
Air traffic control 1 1.6 1 (1.6%) group
Author profiling 5 8.2 5 (8.2%) group
Cultural heritage 2 3.3 2 (3.3%) group
Disaster response/managment 4 6.6 4 (6.6%) group
Document image classification 10 16.4 10 (16.4%) group
E-commerce 2 3.3 2 (3.3%) group
Education 2 3.3 2 (3.3%) group
Emotion recognition 4 6.6 4 (6.6%) group
Fake news detection 2 3.3 2 (3.3%) group
Filled Pause Detection 1 1.6 1 (1.6%) group
Finance 1 1.6 1 (1.6%) group
Hate speech and offensive language detection 2 3.3 2 (3.3%) group
Healthcare 3 4.9 3 (4.9%) group
Industrial fault diagnosis 1 1.6 1 (1.6%) group
Malware Text Classification 1 1.6 1 (1.6%) group
Research paper classification 2 3.3 2 (3.3%) group
Safety-Report Observations 1 1.6 1 (1.6%) group
Sentiment analysis 5 8.2 5 (8.2%) group
Social Network 1 1.6 1 (1.6%) group
Social media analysis 7 11.5 7 (11.5%) group
Speaker role identification 1 1.6 1 (1.6%) group
Web document classification 3 4.9 3 (4.9%) group
# 1. Wrap the title at ~60 characters per line
# Treemap of the Distribution of Works on Multimodal–Task (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mm-task-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

3.1.3 Data Modalities

data_modalities <- list()
data_modalities[["Text + Image"]] <-    c('ghorbanali2024, liub2024, liut2024, tengfeil2024, wajdm2024, yushili2024, zhangy2024, arlqaraleshs2024, bakkalis2023, chenz2023, chos2023, jarquin2023, liangz2023, linckere2023, luzdearau2023, rasheeda2023, zouh2023, adwaithd2022, andriyanovn2022, chenl2022, kanchid2022, paraskevopoulos2022, wangq2022, yuet2022, gallo2021, garg2021, guelorget2021, ma2021, setiawan2021, wang2021, zingaro2021, braz2020, carmona2020, hessel2020, jainr2019, ravikiranm2019, argon2018, guptad2018, matricm2018, tellez2018, rajendran2016, cristanim2014, liparas2014, zhangx2010, chens2009')
data_modalities[["Text + Audio"]] <-    c('ronghaop2024, guod2023, ortizperez2023, chatziagapia2022, sapeao2022, zhu2020, akhtiamovo2017')
data_modalities[["Symbolic Music + Metadata"]] <-   c('perezgraciat2010')
data_modalities[["Text + Metadata"]] <- c('jarrahia2023', 'shah2023, kozienkop2023, dacosta2022')
data_modalities[["Text + Image + Audio"]] <-    c('akhiamov2018, schmittm2017')
data_modalities[["Text + Motion Capture + Audio"]] <-   c('kenny2023')
data_modalities[["Text + Time series"]] <-  c('dongpin2022, debreuij2020, anget2018')
data_modalities[["Text + Image + Metadata"]] <- c('jiangs2024, fujinumay2023, reil2023, guq2022')

summary_df <- create_bubble_df(year_multimodal_df, data_modalities)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(8))(8),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Data Modalities (combinations of text, image, audio)",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Data Modalities (combinations of text, image, audio)") %>% kableExtra::kable_styling()
Table 3.9: Table 3.10: Data Modalities (combinations of text, image, audio)
group YEAR n
2 Symbolic Music + Metadata 2010 1
6 Text + Audio 2017 1
12 Text + Audio 2020 1
16 Text + Audio 2022 2
21 Text + Audio 2023 2
26 Text + Audio 2024 1
1 Text + Image 2009 1
3 Text + Image 2010 1
4 Text + Image 2014 2
5 Text + Image 2016 1
8 Text + Image 2018 4
11 Text + Image 2019 2
13 Text + Image 2020 3
15 Text + Image 2021 7
17 Text + Image 2022 9
22 Text + Image 2023 9
27 Text + Image 2024 6
7 Text + Image + Audio 2017 1
9 Text + Image + Audio 2018 1
18 Text + Image + Metadata 2022 2
23 Text + Image + Metadata 2023 1
28 Text + Image + Metadata 2024 1
19 Text + Metadata 2022 2
24 Text + Metadata 2023 2
25 Text + Motion Capture + Audio 2023 1
10 Text + Time series 2018 1
14 Text + Time series 2020 1
20 Text + Time series 2022 1
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Data Modalities (combinations of text, image, audio)") %>% kableExtra::kable_styling()
Table 3.11: Table 3.12: Data Modalities (combinations of text, image, audio)
group count percentage label column
Symbolic Music + Metadata 1 1.5 1 (1.5%) group
Text + Audio 7 10.4 7 (10.4%) group
Text + Image 45 67.2 45 (67.2%) group
Text + Image + Audio 2 3.0 2 (3%) group
Text + Image + Metadata 4 6.0 4 (6%) group
Text + Metadata 4 6.0 4 (6%) group
Text + Motion Capture + Audio 1 1.5 1 (1.5%) group
Text + Time series 3 4.5 3 (4.5%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Multimodal–Data Modalities (combinations of text, image, audio) (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 70
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mm-data-mod-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

3.1.4 Type of Classification Task

classification_tasks <- list()
classification_tasks[['Binary Classification']] <- c('guod2023, jarquin2023, jarrahia2023, kozienkop2023, liangz2023, ortizperez2023, shah2023, adwaithd2022, chatziagapia2022, dacosta2022, guq2022, braz2020, debreuij2020, ravikiranm2019, akhiamov2018, argon2018, guptad2018, matricm2018, tellez2018, akhtiamovo2017')
classification_tasks[['Multi-class Classification']] <- c('ghorbanali2024, jiangs2024, liub2024, liut2024, ronghaop2024, tengfeil2024, wajdm2024, yushili2024, zhangy2024, bakkalis2023, chenz2023, chos2023, jarquin2023, kenny2023, linckere2023, luzdearau2023, reil2023, zouh2023, adwaithd2022, andriyanovn2022, chenl2022, dongpin2022, kanchid2022, paraskevopoulos2022, sapeao2022, wangq2022, yuet2022, gallo2021, garg2021, setiawan2021, wang2021, zingaro2021, carmona2020, hessel2020, zhu2020, jainr2019, anget2018, schmittm2017, rajendran2016, cristanim2014, liparas2014, perezgraciat2010, zhangx2010, chens2009')
classification_tasks[['Multi-label Classification']] <- c('fujinumay2023, guq2022, guelorget2021, ma2021')


summary_df <- create_bubble_df(year_multimodal_df, classification_tasks)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(6))(6),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Type of Classification Task",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Type of Classification Task") %>% kableExtra::kable_styling()
Table 3.13: Table 3.14: Type of Classification Task
group YEAR n
5 Binary Classification 2017 1
7 Binary Classification 2018 5
9 Binary Classification 2019 1
11 Binary Classification 2020 2
15 Binary Classification 2022 5
18 Binary Classification 2023 6
1 Multi-class Classification 2009 1
2 Multi-class Classification 2010 2
3 Multi-class Classification 2014 2
4 Multi-class Classification 2016 1
6 Multi-class Classification 2017 1
8 Multi-class Classification 2018 1
10 Multi-class Classification 2019 1
12 Multi-class Classification 2020 3
13 Multi-class Classification 2021 5
16 Multi-class Classification 2022 11
19 Multi-class Classification 2023 8
21 Multi-class Classification 2024 8
14 Multi-label Classification 2021 2
17 Multi-label Classification 2022 1
20 Multi-label Classification 2023 1
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Type of Classification Task") %>% kableExtra::kable_styling()
Table 3.15: Table 3.16: Type of Classification Task
group count percentage label column
Binary Classification 20 29.4 20 (29.4%) group
Multi-class Classification 44 64.7 44 (64.7%) group
Multi-label Classification 4 5.9 4 (5.9%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Type of Classification Task (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 70
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mm-class-task-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

3.1.5 Learning Paradigms

learning_paradigms <- list()
learning_paradigms[['Supervised Learning']] <- c('ghorbanali2024, jiangs2024, liub2024, liut2024, tengfeil2024, wajdm2024, yushili2024,  chenz2023, guod2023, jarquin2023, jarrahia2023, kenny2023, kozienkop2023, liangz2023, linckere2023, luzdearau2023, ortizperez2023, rasheeda2023, shah2023, adwaithd2022, andriyanovn2022, chatziagapia2022, dacosta2022, dongpin2022, guq2022, sapeao2022, yuet2022, gallo2021, ma2021, setiawan2021, wang2021, garg2021, braz2020, carmona2020, debreuij2020, hessel2020, jainr2019, ravikiranm2019, akhiamov2018, argon2018, matricm2018, tellez2018, akhtiamovo2017, schmittm2017, cristanim2014, liparas2014, perezgraciat2010, zhangx2010, chens2009')
learning_paradigms[['Semi-supervised Learning']] <- c('chenl2022, anget2018, guptad2018')
learning_paradigms[['Active Learning']] <- c('guelorget2021')
learning_paradigms[['Transfer Learning']] <- c('ghorbanali2024, ronghaop2024, chenz2023, chos2023, fujinumay2023, chatziagapia2022, kanchid2022, wangq2022, zingaro2021, rajendran2016')
learning_paradigms[['Contrastive Learning']] <- c('bakkalis2023, zouh2023,  chenl2022, paraskevopoulos2022')
learning_paradigms[['Multi-task Learning']] <- c('reil2023')


summary_df <- create_bubble_df(year_multimodal_df, learning_paradigms)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(6))(6),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Learning Paradigms",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Learning Paradigms") %>% kableExtra::kable_styling()
Table 3.17: Table 3.18: Learning Paradigms
group YEAR n
10 Active Learning 2021 1
13 Contrastive Learning 2022 2
18 Contrastive Learning 2023 2
14 Multi-task Learning 2022 1
6 Semi-supervised Learning 2018 2
15 Semi-supervised Learning 2022 1
1 Supervised Learning 2009 1
2 Supervised Learning 2010 2
3 Supervised Learning 2014 2
5 Supervised Learning 2017 2
7 Supervised Learning 2018 4
8 Supervised Learning 2019 2
9 Supervised Learning 2020 4
11 Supervised Learning 2021 5
16 Supervised Learning 2022 11
19 Supervised Learning 2023 10
21 Supervised Learning 2024 6
4 Transfer Learning 2016 1
12 Transfer Learning 2021 1
17 Transfer Learning 2022 3
20 Transfer Learning 2023 3
22 Transfer Learning 2024 2
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Learning Paradigms") %>% kableExtra::kable_styling()
Table 3.19: Table 3.20: Learning Paradigms
group count percentage label column
Active Learning 1 1.5 1 (1.5%) group
Contrastive Learning 4 5.9 4 (5.9%) group
Multi-task Learning 1 1.5 1 (1.5%) group
Semi-supervised Learning 3 4.4 3 (4.4%) group
Supervised Learning 49 72.1 49 (72.1%) group
Transfer Learning 10 14.7 10 (14.7%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Learning Paradigms (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mm-learn-para-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

3.1.6 Fusion Strategies

fusion_strategies <- list()
fusion_strategies[['Early Fusion']] <- c('arlqaraleshs2024, jiangs2024, liut2024, ronghaop2024, tengfeil2024, wajdm2024, zhangy2024, bakkalis2023, chos2023, guod2023, jarrahia2023, kenny2023, kozienkop2023, liangz2023, luzdearau2023, ortizperez2023, rasheeda2023, shah2023, adwaithd2022, dacosta2022, kanchid2022, paraskevopoulos2022, sapeao2022, wangq2022, yuet2022, gallo2021, guelorget2021, ma2021,  zingaro2021, braz2020, carmona2020, debreuij2020, zhu2020, jainr2019, ravikiranm2019, anget2018, argon2018, guptad2018, schmittm2017, cristanim2014, perezgraciat2010')
fusion_strategies[['Late Fusion']] <- c('arlqaraleshs2024, liub2024, yushili2024, reil2023, andriyanovn2022, chatziagapia2022, dacosta2022, dongpin2022, guq2022, wang2021, garg2021, setiawan2021, ravikiranm2019, matricm2018, tellez2018, liparas2014, zhangx2010, chens2009')
fusion_strategies[['Hybrid Fusion']] <- c('ghorbanali2024, chenz2023, fujinumay2023, linckere2023, zouh2023, dacosta2022, ravikiranm2019, akhiamov2018, akhtiamovo2017, rajendran2016')

summary_df <- create_bubble_df(year_multimodal_df, fusion_strategies)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(6))(6),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Fusion Strategies",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Fusion Strategies") %>% kableExtra::kable_styling()
Table 3.21: Table 3.22: Fusion Strategies
group YEAR n
2 Early Fusion 2010 1
4 Early Fusion 2014 1
7 Early Fusion 2017 1
9 Early Fusion 2018 3
12 Early Fusion 2019 2
15 Early Fusion 2020 4
16 Early Fusion 2021 4
18 Early Fusion 2022 10
21 Early Fusion 2023 10
24 Early Fusion 2024 5
6 Hybrid Fusion 2016 1
8 Hybrid Fusion 2017 1
10 Hybrid Fusion 2018 1
13 Hybrid Fusion 2019 1
19 Hybrid Fusion 2022 1
22 Hybrid Fusion 2023 4
25 Hybrid Fusion 2024 1
1 Late Fusion 2009 1
3 Late Fusion 2010 1
5 Late Fusion 2014 1
11 Late Fusion 2018 2
14 Late Fusion 2019 1
17 Late Fusion 2021 3
20 Late Fusion 2022 6
23 Late Fusion 2023 1
26 Late Fusion 2024 2
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Fusion Strategies") %>% kableExtra::kable_styling()
Table 3.23: Table 3.24: Fusion Strategies
group count percentage label column
Early Fusion 41 59.4 41 (59.4%) group
Hybrid Fusion 10 14.5 10 (14.5%) group
Late Fusion 18 26.1 18 (26.1%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Fusion Strategies (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 70
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mm-fusion-start-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

3.1.7 Fusion Techniques

fusion_techniques <- list()
fusion_techniques[['Machine Learning based Fusion']] <- c('reil2023, dacosta2022, wangq2022, setiawan2021, wang2021, carmona2020, akhiamov2018, argon2018, tellez2018, akhtiamovo2017, liparas2014, zhangx2010, chens2009')
fusion_techniques[['Probabilistic Fusion']] <- c('ghorbanali2024, dongpin2022, cristanim2014, perezgraciat2010')
fusion_techniques[['Neural Network Fusion']] <- c('jiangs2024, liub2024, liut2024, ronghaop2024, tengfeil2024, wajdm2024, yushili2024,   chos2023, fujinumay2023, jarquin2023, jarrahia2023, kenny2023, kozienkop2023, linckere2023, luzdearau2023, ortizperez2023, shah2023, zouh2023, adwaithd2022, chatziagapia2022, kanchid2022, paraskevopoulos2022, sapeao2022, yuet2022, gallo2021, garg2021, guelorget2021, ma2021, zingaro2021, braz2020, debreuij2020, jainr2019, ravikiranm2019, anget2018, guptad2018, rajendran2016')
fusion_techniques[['Attention Fusion']] <- c('bakkalis2023, chenz2023, guod2023, liangz2023, rasheeda2023, ronghaop2024, zhu2020')

summary_df <- create_bubble_df(year_multimodal_df, fusion_techniques)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(6))(6),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Fusion Techniques",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Fusion Techniques") %>% kableExtra::kable_styling()
Table 3.25: Table 3.26: Fusion Techniques
group YEAR n
11 Attention Fusion 2020 1
16 Attention Fusion 2022 1
20 Attention Fusion 2023 4
22 Attention Fusion 2024 1
1 Machine Learning based Fusion 2009 1
2 Machine Learning based Fusion 2010 1
4 Machine Learning based Fusion 2014 1
7 Machine Learning based Fusion 2017 1
8 Machine Learning based Fusion 2018 3
12 Machine Learning based Fusion 2020 1
14 Machine Learning based Fusion 2021 2
17 Machine Learning based Fusion 2022 3
6 Neural Network Fusion 2016 1
9 Neural Network Fusion 2018 2
10 Neural Network Fusion 2019 2
13 Neural Network Fusion 2020 2
15 Neural Network Fusion 2021 5
18 Neural Network Fusion 2022 8
21 Neural Network Fusion 2023 10
23 Neural Network Fusion 2024 6
3 Probabilistic Fusion 2010 1
5 Probabilistic Fusion 2014 1
19 Probabilistic Fusion 2022 1
24 Probabilistic Fusion 2024 1
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Fusion Techniques") %>% kableExtra::kable_styling()
Table 3.27: Table 3.28: Fusion Techniques
group count percentage label column
Attention Fusion 7 11.7 7 (11.7%) group
Machine Learning based Fusion 13 21.7 13 (21.7%) group
Neural Network Fusion 36 60.0 36 (60%) group
Probabilistic Fusion 4 6.7 4 (6.7%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Fusion Techniques (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 70
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mm-fusion-tech-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

3.1.8 Datasets

datasets <- list()
datasets[['Public Benchmark Datasets']] <- c('ghorbanali2024, liub2024, liut2024, tengfeil2024, wajdm2024, yushili2024, bakkalis2023, kenny2023, kozienkop2023, liangz2023, zouh2023, chenl2022, kanchid2022, wangq2022, gallo2021, zingaro2021, jainr2019, argon2018, matricm2018, tellez2018, rajendran2016, chens2009')
datasets[['Domain-specific Datasets']] <- c('jiangs2024, ronghaop2024, zhangy2024, arlqaraleshs2024, chenz2023, chos2023, fujinumay2023, guod2023, jarquin2023, jarrahia2023, linckere2023, luzdearau2023, ortizperez2023, rasheeda2023, reil2023, shah2023, adwaithd2022, andriyanovn2022, chatziagapia2022, chenl2022, dacosta2022, dongpin2022, guq2022, paraskevopoulos2022, sapeao2022, yuet2022, garg2021, guelorget2021, setiawan2021, wang2021, braz2020, carmona2020, debreuij2020, zhu2020, ravikiranm2019, akhiamov2018, anget2018, guptad2018, akhtiamovo2017, schmittm2017, cristanim2014, liparas2014, perezgraciat2010, zhangx2010')
datasets[['Multilingual Datasets']] <- c('chos2023, fujinumay2023, liangz2023, ma2021, argon2018, matricm2018')

summary_df <- create_bubble_df(year_multimodal_df, datasets)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(6))(6),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Datasets",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Datasets") %>% kableExtra::kable_styling()
Table 3.29: Table 3.30: Datasets
group YEAR n
2 Domain-specific Datasets 2010 2
3 Domain-specific Datasets 2014 2
5 Domain-specific Datasets 2017 2
6 Domain-specific Datasets 2018 3
9 Domain-specific Datasets 2019 1
11 Domain-specific Datasets 2020 4
12 Domain-specific Datasets 2021 4
15 Domain-specific Datasets 2022 14
17 Domain-specific Datasets 2023 9
20 Domain-specific Datasets 2024 3
7 Multilingual Datasets 2018 2
13 Multilingual Datasets 2021 1
18 Multilingual Datasets 2023 3
1 Public Benchmark Datasets 2009 1
4 Public Benchmark Datasets 2016 1
8 Public Benchmark Datasets 2018 3
10 Public Benchmark Datasets 2019 1
14 Public Benchmark Datasets 2021 2
16 Public Benchmark Datasets 2022 3
19 Public Benchmark Datasets 2023 6
21 Public Benchmark Datasets 2024 5
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Datasets") %>% kableExtra::kable_styling()
Table 3.31: Table 3.32: Datasets
group count percentage label column
Domain-specific Datasets 44 61.1 44 (61.1%) group
Multilingual Datasets 6 8.3 6 (8.3%) group
Public Benchmark Datasets 22 30.6 22 (30.6%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Datasets (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mm-datasets-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

3.2 Datasets, Models, and Performance metrics selected for comparison

GG_PLOTS_REGISTER <- list()

# Rename the columns to make them easier to work with
multimodal_df_renamed <- multimodal_df %>%
  rename(
    datasets = 'Identified the datasets used in the article',
    dis_datasets = 'Disambiguated datasets names',
    models = 'What other models were selected for comparison?',
    dis_models = 'Disambiguated models names',
    metrics = 'Identified performance metrics used in the article',
    dis_metrics = 'Disambiguated performance metrics names'
  )

# Create better titles for each column
title_mapping <- c(
  "dis_datasets" = "Disambiguated datasets names",
  "dis_models" = "Disambiguated models names",
  "dis_metrics" = "Disambiguated performance metrics names"
)

3.2.1 Datasets

# Clean and standardize responses - convert everything to lower case and handle variations
multimodal_dis_df <- multimodal_df_renamed %>%
  select('l.p', 'dis_datasets') %>%
  separate_rows('dis_datasets', sep = "\\r?\\n") %>%
  mutate(
    dis_datasets = stringr::str_to_lower(dis_datasets),
    dis_datasets = stringr::str_trim(dis_datasets),
    dis_datasets = na_if(dis_datasets, "")
  )

summary_df <- create_summary_df(multimodal_dis_df, 'dis_datasets') %>%
  arrange(desc(count)) %>% filter(!is.na(group))

summary_cuted_df <- summary_df %>%
  # 1) lump any low‑frequency group into "other"
  mutate(group = if_else(count < 4, "other", group)) %>%
  # 2) re‑aggregate by the (possibly new) group
  group_by(group, column) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  # 3) recompute percentages and labels over the new totals
  mutate(percentage = count / sum(count) * 100,
         label      = paste0(count, " \n (", round(percentage, 1), "%)")) %>%
  # 4) order by descending count
  arrange(desc(count))

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(7))(7),
                                  unique(summary_cuted_df$group))

ggplot_chart <- create_treemap_chart(
  summary_df = summary_cuted_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)

ggsave(
  "fig-mm-eval-datasets-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['dis_datasets']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['dis_datasets']) %>% kableExtra::kable_styling()
Table 3.33: Table 3.34: Disambiguated datasets names
group count percentage label column
food-101 6 3.1 6 (3.1%) dis_datasets
rvl-cdip 6 3.1 6 (3.1%) dis_datasets
twitter data 6 3.1 6 (3.1%) dis_datasets
wikipedia data 5 2.6 5 (2.6%) dis_datasets
amazon reviews 4 2.0 4 (2%) dis_datasets
tobacco document images 4 2.0 4 (2%) dis_datasets
fakenewsnet 3 1.5 3 (1.5%) dis_datasets
mmaterials 3 1.5 3 (1.5%) dis_datasets
mvsa (multimodal sentiment analysis) 3 1.5 3 (1.5%) dis_datasets
yelp reviews 3 1.5 3 (1.5%) dis_datasets
ag news 2 1.0 2 (1%) dis_datasets
da-vincis challenge data 2 1.0 2 (1%) dis_datasets
iemocap 2 1.0 2 (1%) dis_datasets
maapd 2 1.0 2 (1%) dis_datasets
pan 2018 authorship profiling 2 1.0 2 (1%) dis_datasets
reddit data 2 1.0 2 (1%) dis_datasets
reuters corpora 2 1.0 2 (1%) dis_datasets
roco (radiology objects in context) 2 1.0 2 (1%) dis_datasets
smart video corpus (svc) 2 1.0 2 (1%) dis_datasets
20 newsgroups 1 0.5 1 (0.5%) dis_datasets
aifun 1 0.5 1 (0.5%) dis_datasets
alector 1 0.5 1 (0.5%) dis_datasets
arabic book cover-28 dataset 1 0.5 1 (0.5%) dis_datasets
arousal dataset 1 0.5 1 (0.5%) dis_datasets
arxiv academic papers 1 0.5 1 (0.5%) dis_datasets
atcspeech corpus 1 0.5 1 (0.5%) dis_datasets
avocado 1 0.5 1 (0.5%) dis_datasets
band-in-a-box files 1 0.5 1 (0.5%) dis_datasets
bbc news 1 0.5 1 (0.5%) dis_datasets
bill of loading (bl) 1 0.5 1 (0.5%) dis_datasets
book cover-28-ext dataset 1 0.5 1 (0.5%) dis_datasets
bookcover30 dataset 1 0.5 1 (0.5%) dis_datasets
business registration certificate (brc) 1 0.5 1 (0.5%) dis_datasets
chexpert 1 0.5 1 (0.5%) dis_datasets
chnsenticorp 1 0.5 1 (0.5%) dis_datasets
cifar-10 1 0.5 1 (0.5%) dis_datasets
cmu-moseas 1 0.5 1 (0.5%) dis_datasets
cockamamie gobbledegook 1 0.5 1 (0.5%) dis_datasets
common voice 6.1 1 0.5 1 (0.5%) dis_datasets
cub-200 1 0.5 1 (0.5%) dis_datasets
custom dataset 1 0.5 1 (0.5%) dis_datasets
dataset was collected from the chengdu atc center 1 0.5 1 (0.5%) dis_datasets
dbpedia 1 0.5 1 (0.5%) dis_datasets
deepchart 1 0.5 1 (0.5%) dis_datasets
demcare 1 0.5 1 (0.5%) dis_datasets
dementiabank pitt corpus 1 0.5 1 (0.5%) dis_datasets
deprem dataset (earthquake) 1 0.5 1 (0.5%) dis_datasets
docfigure 1 0.5 1 (0.5%) dis_datasets
emofilm 1 0.5 1 (0.5%) dis_datasets
emotion dataset 1 0.5 1 (0.5%) dis_datasets
emotional data (also referred to as sentimenti) 1 0.5 1 (0.5%) dis_datasets
enron 1 0.5 1 (0.5%) dis_datasets
fantastiques exercices 1 0.5 1 (0.5%) dis_datasets
ferramenta 1 0.5 1 (0.5%) dis_datasets
ferramenta dataset 1 0.5 1 (0.5%) dis_datasets
filled pauses dataset 1 0.5 1 (0.5%) dis_datasets
geonames database 1 0.5 1 (0.5%) dis_datasets
global satellite mapping of precipitation (gsmap) 1 0.5 1 (0.5%) dis_datasets
globalphone 1 0.5 1 (0.5%) dis_datasets
golden-chnsenticorp (g-chnsenticorp) 1 0.5 1 (0.5%) dis_datasets
gossipcop 1 0.5 1 (0.5%) dis_datasets
greek filtered c4 (gfc4) 1 0.5 1 (0.5%) dis_datasets
ground truth database team of object and concept recognition for content-based image retrieval of university of washington (image dataset) 1 0.5 1 (0.5%) dis_datasets
hellenic national corpus (hnc) 1 0.5 1 (0.5%) dis_datasets
historical flood events data from cge-sp 1 0.5 1 (0.5%) dis_datasets
historical rainfall data (2009–2018) 1 0.5 1 (0.5%) dis_datasets
hydrobasins dataset level 9 1 0.5 1 (0.5%) dis_datasets
i-ctx 1 0.5 1 (0.5%) dis_datasets
i-int 1 0.5 1 (0.5%) dis_datasets
i-sem 1 0.5 1 (0.5%) dis_datasets
iit-cdip 1 0.5 1 (0.5%) dis_datasets
imdb movie reviews 1 0.5 1 (0.5%) dis_datasets
imdb-wiki dataset 1 0.5 1 (0.5%) dis_datasets
in-house dataset of documents 1 0.5 1 (0.5%) dis_datasets
indonesian social media posts dataset 1 0.5 1 (0.5%) dis_datasets
inter1sp 1 0.5 1 (0.5%) dis_datasets
ipc (international patent classification) system 1 0.5 1 (0.5%) dis_datasets
level 1 classifier dataset 1 0.5 1 (0.5%) dis_datasets
level 2 classifier dataset 1 0.5 1 (0.5%) dis_datasets
librispeech 1 0.5 1 (0.5%) dis_datasets
local newspaper dataset 1 0.5 1 (0.5%) dis_datasets
luxury standard 1 0.5 1 (0.5%) dis_datasets
malwareqrdb 1 0.5 1 (0.5%) dis_datasets
malwaretextdb-v2 1 0.5 1 (0.5%) dis_datasets
manulex 1 0.5 1 (0.5%) dis_datasets
mc-30 1 0.5 1 (0.5%) dis_datasets
medicat 1 0.5 1 (0.5%) dis_datasets
medvqa-2019 1 0.5 1 (0.5%) dis_datasets
men-tr-3000 1 0.5 1 (0.5%) dis_datasets
meteorological data from inmet 1 0.5 1 (0.5%) dis_datasets
mexican twitter corpus (mex-a3t-500) 1 0.5 1 (0.5%) dis_datasets
midi files 1 0.5 1 (0.5%) dis_datasets
mimic-cxr 1 0.5 1 (0.5%) dis_datasets
mscoco dataset 1 0.5 1 (0.5%) dis_datasets
multieurlex-doc 1 0.5 1 (0.5%) dis_datasets
multilingual ted corpus 1 0.5 1 (0.5%) dis_datasets
mura (musculoskeletal radiographs) dataset 1 0.5 1 (0.5%) dis_datasets
müsilaj dataset (sea saliva) 1 0.5 1 (0.5%) dis_datasets
n24news 1 0.5 1 (0.5%) dis_datasets
news visual-text dataset (nvtd) 1 0.5 1 (0.5%) dis_datasets
newsplease 1 0.5 1 (0.5%) dis_datasets
nist special database 6 (also referred to as nist-tax form or nist) 1 0.5 1 (0.5%) dis_datasets
nus-wide dataset family 1 0.5 1 (0.5%) dis_datasets
office-caltech 1 0.5 1 (0.5%) dis_datasets
office-home 1 0.5 1 (0.5%) dis_datasets
praxis gesture 1 0.5 1 (0.5%) dis_datasets
r-pop 1 0.5 1 (0.5%) dis_datasets
radnli 1 0.5 1 (0.5%) dis_datasets
ravdess 1 0.5 1 (0.5%) dis_datasets
recola 1 0.5 1 (0.5%) dis_datasets
review 1 0.5 1 (0.5%) dis_datasets
rg-65 1 0.5 1 (0.5%) dis_datasets
rsna pneumonia 1 0.5 1 (0.5%) dis_datasets
safety4all 1 0.5 1 (0.5%) dis_datasets
savee 1 0.5 1 (0.5%) dis_datasets
saganak dataset (downpour) 1 0.5 1 (0.5%) dis_datasets
sd dataset 1 0.5 1 (0.5%) dis_datasets
seempad 1 0.5 1 (0.5%) dis_datasets
self-made dataset 1 0.5 1 (0.5%) dis_datasets
semeval-2022 multimedia automatic misogyny identification (mami) dataset<U+200B>. 1 0.5 1 (0.5%) dis_datasets
sensitive images dataset 1 0.5 1 (0.5%) dis_datasets
sewa (automatic sentiment analysis in the wild) 1 0.5 1 (0.5%) dis_datasets
shinra2020-ml dataset 1 0.5 1 (0.5%) dis_datasets
slake 1 0.5 1 (0.5%) dis_datasets
sogou news/text 1 0.5 1 (0.5%) dis_datasets
spanish meacorpus 2023 1 0.5 1 (0.5%) dis_datasets
speaking rate dataset 1 0.5 1 (0.5%) dis_datasets
stanford sentiment treebank (sst) 1 0.5 1 (0.5%) dis_datasets
svic+ 1 0.5 1 (0.5%) dis_datasets
switchboard corpus (specifically the switchboard-nxt release) 1 0.5 1 (0.5%) dis_datasets
swt dataset 1 0.5 1 (0.5%) dis_datasets
t-st1 1 0.5 1 (0.5%) dis_datasets
t-st2 1 0.5 1 (0.5%) dis_datasets
t-vis 1 0.5 1 (0.5%) dis_datasets
t4sa 1 0.5 1 (0.5%) dis_datasets
tass 2017 1 0.5 1 (0.5%) dis_datasets
the dataset used in the article is the fault records dataset from a factory, which includes both text sequence data and time series data associated with four different fault types. 1 0.5 1 (0.5%) dis_datasets
the guardian 1 0.5 1 (0.5%) dis_datasets
the main dataset used in this article is the silknow knowledge graph dataset. no other specific dataset names are mentioned in the article. 1 0.5 1 (0.5%) dis_datasets
trafik kazasi dataset (traffic accident) 1 0.5 1 (0.5%) dis_datasets
trec question/text classification 1 0.5 1 (0.5%) dis_datasets
tweets and images from users in english, spanish, and arabic. 1 0.5 1 (0.5%) dis_datasets
twitter-15 1 0.5 1 (0.5%) dis_datasets
university lecture recordings from universitat politècnica de valència (upv) 1 0.5 1 (0.5%) dis_datasets
uspto (united states patent and trademark office) patent dataset 1 0.5 1 (0.5%) dis_datasets
vad (voice activity detection) dataset 1 0.5 1 (0.5%) dis_datasets
valence dataset 1 0.5 1 (0.5%) dis_datasets
vqa-rad 1 0.5 1 (0.5%) dis_datasets
weibo 1 0.5 1 (0.5%) dis_datasets
wiki-doc 1 0.5 1 (0.5%) dis_datasets
ws-353 1 0.5 1 (0.5%) dis_datasets
yahoo answers 1 0.5 1 (0.5%) dis_datasets
yangin dataset (fire) 1 0.5 1 (0.5%) dis_datasets

3.2.2 Models selected for comparison

# Clean and standardize responses - convert everything to lower case and handle variations
multimodal_dis_df <- multimodal_df_renamed %>%
  select('l.p', 'dis_models') %>%
  separate_rows('dis_models', sep = "\\r?\\n") %>%
  mutate(
    dis_models = stringr::str_to_lower(dis_models),
    dis_models = stringr::str_trim(dis_models),
    dis_models = na_if(dis_models, "")
  )

summary_df <- create_summary_df(multimodal_dis_df, 'dis_models') %>%
  arrange(desc(count)) %>% filter(!is.na(group))

summary_cuted_df <- summary_df %>%
  # 1) lump any low‑frequency group into "other"
  mutate(group = if_else(count < 20, "other", group)) %>%
  # 2) re‑aggregate by the (possibly new) group
  group_by(group, column) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  # 3) recompute percentages and labels over the new totals
  mutate(percentage = count / sum(count) * 100,
         label      = paste0(count, " (", round(percentage, 1), "%)")) %>%
  # 4) order by descending count
  arrange(desc(count))

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(7))(7),
                                  unique(summary_cuted_df$group))

ggplot_chart <- create_treemap_chart(
  summary_df = summary_cuted_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)

ggsave(
  "fig-mm-eval-models-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['dis_models']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['dis_models']) %>% kableExtra::kable_styling()
Table 3.35: Table 3.36: Disambiguated models names
group count percentage label column
multi-modal/multi-view specific architectures & techniques 61 16.3 61 (16.3%) dis_models
convolutional neural networks (cnn) 41 10.9 41 (10.9%) dis_models
traditional/statistical machine learning & other methods 24 6.4 24 (6.4%) dis_models
transformer-based architectures 23 6.1 23 (6.1%) dis_models
bert (bidirectional encoder representations from transformers) 22 5.9 22 (5.9%) dis_models
long short-term memory (lstm) 20 5.3 20 (5.3%) dis_models
vggnet 16 4.3 16 (4.3%) dis_models
word embeddings and text representation 14 3.7 14 (3.7%) dis_models
resnet 13 3.5 13 (3.5%) dis_models
support vector machine (svm) 13 3.5 13 (3.5%) dis_models
layoutlm family (document ai transformers) 9 2.4 9 (2.4%) dis_models
roberta (robustly optimized bert pretraining approach) 8 2.1 8 (2.1%) dis_models
alexnet 7 1.9 7 (1.9%) dis_models
googlenet 6 1.6 6 (1.6%) dis_models
self-attention / attention 6 1.6 6 (1.6%) dis_models
naive bayes 5 1.3 5 (1.3%) dis_models
neural network 5 1.3 5 (1.3%) dis_models
densenet 4 1.1 4 (1.1%) dis_models
random forest 4 1.1 4 (1.1%) dis_models
vision transformer (vit) 4 1.1 4 (1.1%) dis_models
boosting methods 3 0.8 3 (0.8%) dis_models
decision tree 3 0.8 3 (0.8%) dis_models
distilbert 3 0.8 3 (0.8%) dis_models
gated recurrent unit (gru) 3 0.8 3 (0.8%) dis_models
k-nearest neighbors (knn) 3 0.8 3 (0.8%) dis_models
logistic regression 3 0.8 3 (0.8%) dis_models
recurrent neural network (rnn) 3 0.8 3 (0.8%) dis_models
xlnet 3 0.8 3 (0.8%) dis_models
albert 2 0.5 2 (0.5%) dis_models
bigbird 2 0.5 2 (0.5%) dis_models
efficientnet 2 0.5 2 (0.5%) dis_models
tobert 2 0.5 2 (0.5%) dis_models
vilt 2 0.5 2 (0.5%) dis_models
<U+200B> 2 0.5 2 (0.5%) dis_models
. 1 0.3 1 (0.3%) dis_models
autoencoders 1 0.3 1 (0.3%) dis_models
bagging 1 0.3 1 (0.3%) dis_models
convolutional recurrent neural network (crnn) 1 0.3 1 (0.3%) dis_models
deep multi-level attentive network (dmlanet) 1 0.3 1 (0.3%) dis_models
deeppatent 1 0.3 1 (0.3%) dis_models
gr-electra 1 0.3 1 (0.3%) dis_models
graph neural networks (gnn) 1 0.3 1 (0.3%) dis_models
latent dirichlet allocation (lda) 1 0.3 1 (0.3%) dis_models
latent semantic analysis (lsa) 1 0.3 1 (0.3%) dis_models
linear model 1 0.3 1 (0.3%) dis_models
longformer 1 0.3 1 (0.3%) dis_models
lstm (word2vec) - cnn (mouzannar, 2018) 1 0.3 1 (0.3%) dis_models
maria 1 0.3 1 (0.3%) dis_models
mobilenet 1 0.3 1 (0.3%) dis_models
nasnet 1 0.3 1 (0.3%) dis_models
robert 1 0.3 1 (0.3%) dis_models
rocchio classifier (centroid classifier) 1 0.3 1 (0.3%) dis_models
transformer 1 0.3 1 (0.3%) dis_models
xgboost (xgb) 1 0.3 1 (0.3%) dis_models
<U+200B><U+200B> 1 0.3 1 (0.3%) dis_models

3.2.3 Performance metrics

# Clean and standardize responses - convert everything to lower case and handle variations
multimodal_dis_df <- multimodal_df_renamed %>%
  select('l.p', 'dis_metrics') %>%
  separate_rows('dis_metrics', sep = "\\r?\\n") %>%
  mutate(
    dis_metrics = stringr::str_to_lower(dis_metrics),
    dis_metrics = stringr::str_trim(dis_metrics),
    dis_metrics = na_if(dis_metrics, "")
  )

summary_df <- create_summary_df(multimodal_dis_df, 'dis_metrics') %>%
  arrange(desc(count)) %>% filter(!is.na(group))

summary_cuted_df <- summary_df %>%
  # 1) lump any low‑frequency group into "other"
  mutate(group = if_else(count < 11, "other", group)) %>%
  # 2) re‑aggregate by the (possibly new) group
  group_by(group, column) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  # 3) recompute percentages and labels over the new totals
  mutate(percentage = count / sum(count) * 100,
         label      = paste0(count, " (", round(percentage, 1), "%)")) %>%
  # 4) order by descending count
  arrange(desc(count))

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(7))(7),
                                  unique(summary_cuted_df$group))

ggplot_chart <- create_treemap_chart(
  summary_df = summary_cuted_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)

ggsave(
  "fig-mm-eval-per-met-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['dis_metrics']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['dis_metrics']) %>% kableExtra::kable_styling()
Table 3.37: Table 3.38: Disambiguated performance metrics names
group count percentage label column
accuracy 53 25.4 53 (25.4%) dis_metrics
f1-score 35 16.7 35 (16.7%) dis_metrics
precision 25 12.0 25 (12%) dis_metrics
recall 25 12.0 25 (12%) dis_metrics
macro f1-score 11 5.3 11 (5.3%) dis_metrics
computational and resource metrics 5 2.4 5 (2.4%) dis_metrics
area under the roc curve (auc/auroc) 4 1.9 4 (1.9%) dis_metrics
loss (general) 4 1.9 4 (1.9%) dis_metrics
nlp-specific metrics 4 1.9 4 (1.9%) dis_metrics
correlation coefficients 3 1.4 3 (1.4%) dis_metrics
macro recall 3 1.4 3 (1.4%) dis_metrics
variability and confidence metrics 3 1.4 3 (1.4%) dis_metrics
entropy/divergence metrics 2 1.0 2 (1%) dis_metrics
kappa statistics 2 1.0 2 (1%) dis_metrics
macro precision 2 1.0 2 (1%) dis_metrics
other specific metrics 2 1.0 2 (1%) dis_metrics
recall at k () 2 1.0 2 (1%) dis_metrics
unweighted average recall (uar) 2 1.0 2 (1%) dis_metrics
weighted f1-score 2 1.0 2 (1%) dis_metrics
balanced accuracy 1 0.5 1 (0.5%) dis_metrics
computational efficiency 1 0.5 1 (0.5%) dis_metrics
confusion matrix 1 0.5 1 (0.5%) dis_metrics
distance and similarity metrics 1 0.5 1 (0.5%) dis_metrics
macro-averaged scores 1 0.5 1 (0.5%) dis_metrics
mean absolute percentage error (mape) 1 0.5 1 (0.5%) dis_metrics
mean r-precision (mrp) 1 0.5 1 (0.5%) dis_metrics
mean reciprocal rank (mrr) 1 0.5 1 (0.5%) dis_metrics
micro f1-score 1 0.5 1 (0.5%) dis_metrics
micro precision 1 0.5 1 (0.5%) dis_metrics
micro recall 1 0.5 1 (0.5%) dis_metrics
testing error 1 0.5 1 (0.5%) dis_metrics
validation ratio 1 0.5 1 (0.5%) dis_metrics
weighted f1-score<U+200B> 1 0.5 1 (0.5%) dis_metrics

3.2.4 All plots

# Treemap charts of the distribution of datasets, models and performance metrixes used in multimodal works
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title,
                             gp = grid::gpar(fontsize = 14, fontface = "plain"),
                             just = "center")

layout <- rbind(
  c(1, 1),
  c(2, 3),
  c(4, 4)
)

combined_plot <- gridExtra::grid.arrange(
  title_grob,
  GG_PLOTS_REGISTER[['dis_datasets']], GG_PLOTS_REGISTER[['dis_metrics']], GG_PLOTS_REGISTER[['dis_models']],
  layout_matrix = layout,
  heights       = c(1, 10, 10)  # tweak these ratios to taste
)

ggsave(
  "fig-mm-datasets-models-met-pie.pdf",
  plot = combined_plot,
  width = 14,
  height = 8
)
plot(combined_plot)

3.3 Validation procedure

GG_PLOTS_REGISTER <- list()

# Rename the columns to make them easier to work with
multimodal_df_renamed <- multimodal_df %>%
  rename(
    test_set_used = 'Was a test set or cross-validation procedure used to determine the performance metrics values? Please write only yes, or no.',
    statistical_tests = 'Were statistical tests used? Please write only yes, or no.',
    other_analysis_methods = 'Did other methods of analyzing the outcomes, for example Bayesian, explainable machine learning methods, be used? Please write yes, or no.',
    ablation_studies = 'Were ablation studies and/or comparisons with unimodal classifiers performed? Please write yes, or no.',
    replication_info = 'Did the authors provide enough information (data, code) to allow for replication of the study? Please write only yes, or no.'
  )

# Clean and standardize responses - convert everything to lower case and handle variations
multimodal_df_clean <- multimodal_df_renamed %>%
  mutate(across(
    c(
      test_set_used,
      statistical_tests,
      other_analysis_methods,
      ablation_studies,
      replication_info
    ),
    ~ case_when(
      tolower(.) %in% c("yes", "y") ~ "Yes",
      tolower(.) %in% c("no", "n") ~ "No",
      is.na(.) ~ "No",
      TRUE ~ "No"
    )
  ))

# Create better titles for each column
title_mapping <- c(
  "test_set_used" = "Test Set or Cross-Validation Used",
  "statistical_tests" = "Statistical Tests Used",
  "other_analysis_methods" = "Alternative Analysis Methods Used",
  "ablation_studies" = "Ablation Studies Performed",
  "replication_info" = "Replication Information Provided"
)
summary_df <- create_summary_df(multimodal_df_clean, 'test_set_used')
global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(2))(2),
                                  unique(summary_df$group))

ggplot_chart <- create_pie_chart(summary_df, title_mapping['test_set_used'], global_palette = global_palette)

ggsave(
  "fig-mm-eval-test-set-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['test_set_used']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['test_set_used']) %>% kableExtra::kable_styling()
Table 3.39: Table 3.40: Test Set or Cross-Validation Used
group count percentage label column
No 5 7.4 5 (7.4%) test_set_used
Yes 63 92.6 63 (92.6%) test_set_used
summary_df <- create_summary_df(multimodal_df_clean, 'statistical_tests')
global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(2))(2),
                                  unique(summary_df$group))
ggplot_chart <- create_pie_chart(summary_df, title_mapping['statistical_tests'], global_palette = global_palette)
  
ggsave("fig-mm-eval-stat-test-pie.pdf", plot = ggplot_chart, width = 14, height = 8)

GG_PLOTS_REGISTER[['statistical_tests']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['statistical_tests']) %>% kableExtra::kable_styling()
Table 3.41: Table 3.42: Statistical Tests Used
group count percentage label column
No 60 88.2 60 (88.2%) statistical_tests
Yes 8 11.8 8 (11.8%) statistical_tests
summary_df <- create_summary_df(multimodal_df_clean, 'other_analysis_methods')
global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(2))(2),
                                  unique(summary_df$group))
ggplot_chart <- create_pie_chart(summary_df, title_mapping['other_analysis_methods'], global_palette = global_palette)

ggsave(
  "fig-mm-eval-other-test-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['other_analysis_methods']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['other_analysis_methods']) %>% kableExtra::kable_styling()
Table 3.43: Table 3.44: Alternative Analysis Methods Used
group count percentage label column
No 66 97.1 66 (97.1%) other_analysis_methods
Yes 2 2.9 2 (2.9%) other_analysis_methods
summary_df <- create_summary_df(multimodal_df_clean, 'ablation_studies')
global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(2))(2),
                                  unique(summary_df$group))
ggplot_chart <- create_pie_chart(summary_df, title_mapping['ablation_studies'], global_palette = global_palette)

ggsave(
  "fig-mm-eval-ablation-study-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['ablation_studies']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['ablation_studies']) %>% kableExtra::kable_styling()
Table 3.45: Table 3.46: Ablation Studies Performed
group count percentage label column
No 4 5.9 4 (5.9%) ablation_studies
Yes 64 94.1 64 (94.1%) ablation_studies
summary_df <- create_summary_df(multimodal_df_clean, 'replication_info')
global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(2))(2),
                                  unique(summary_df$group))
ggplot_chart <- create_pie_chart(summary_df, title_mapping['replication_info'], global_palette = global_palette)

ggsave(
  "fig-mm-eval-replication-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['replication_info']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['replication_info']) %>% kableExtra::kable_styling()
Table 3.47: Table 3.48: Replication Information Provided
group count percentage label column
No 50 73.5 50 (73.5%) replication_info
Yes 18 26.5 18 (26.5%) replication_info

3.3.1 All plots

# Pie charts of studies evaluation & replication
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title,
                             gp = grid::gpar(fontsize = 14, fontface = "plain"),
                             just = "center")

layout <- rbind(
  c(1, NA, 2),  # Top row: plot 1, empty, plot 2
  c(NA, 3, NA), # Middle row: empty, plot 3, empty
  c(4, NA, 5)   # Bottom row: plot 4, empty, plot 5
)

combined_plot <- gridExtra::grid.arrange(
  GG_PLOTS_REGISTER[['test_set_used']],      # position 1: top-left
  GG_PLOTS_REGISTER[['statistical_tests']],  # position 2: top-right
  GG_PLOTS_REGISTER[['other_analysis_methods']], # position 3: middle-center
  GG_PLOTS_REGISTER[['ablation_studies']],   # position 4: bottom-left
  GG_PLOTS_REGISTER[['replication_info']],   # position 5: bottom-right
  layout_matrix = layout
)

combined_plot <- gridExtra::grid.arrange(
  title_grob,
  combined_plot,
  ncol = 1,
  heights = c(0.5, 10)  # Title takes less space than the plots
)

ggsave(
  "fig-mm-eval-test-set-tests-abl-rep-pie.pdf",
  plot = combined_plot,
  width = 14,
  height = 8
)
plot(combined_plot)

4 Multiview reserch papper analysis

multiview_df <- excel_df %>%
  filter(bibtexkey %in% MULTIVIEW_BIBTEXKEY)

head(multiview_df)
## # A tibble: 6 x 21
##     l.p doi        bibtexkey `Article title` `Short note` Please summarize or ~1
##   <dbl> <chr>      <chr>     <chr>           <chr>        <chr>                 
## 1     3 10.1109/A~ gui2021   "Technology Fo~ "The study ~ "The paper \"Technolo~
## 2     4 10.18653/~ huc2021   "One-class Tex~ "The study ~ "The paper \"One-clas~
## 3     8 10.1016/j~ zhang2021 "Learning sent~ "The study ~ "The paper titled \"L~
## 4    10 10.1016/j~ liang2021 "Fusion of het~ "The study ~ "The paper titled \"F~
## 5    12 10.1109/A~ sus2021   "A Dynamic Dis~ "The study ~ "The paper presents a~
## 6    13 10.1111/e~ zhao2023  "Topic identif~ "The study ~ "The paper \"Topic id~
## # i abbreviated name:
## #   1: `Please summarize or provide the most important details of the work. Use a maximum of five sentences.`
## # i 15 more variables: `What are the findings?` <chr>,
## #   `What are the challenges?` <chr>,
## #   `Identified the datasets used in the article` <chr>,
## #   `Disambiguated datasets names` <chr>,
## #   `What other models were selected for comparison?` <chr>, ...
paste("Extracted", nrow(multiview_df), "multiview papers out of", nrow(excel_df), "total papers")
## [1] "Extracted 73 multiview papers out of 139 total papers"

4.1 Topics

year_multiview_df <- multiview_df %>%
  mutate(bibtexkey = tolower(bibtexkey)) %>%
  inner_join(
    bib_df %>% mutate(BIBTEXKEY = tolower(BIBTEXKEY)) %>% select('BIBTEXKEY', 'YEAR'),
    by = c("bibtexkey" = "BIBTEXKEY")
  )

4.1.1 General Framework

general_framework <- list()
general_framework[['When to use multi-view learning']] <-   c('brefeldu2015', 'aminim2009')
general_framework[['Handling missing views']] <-    c('zhangqi2024', 'doinychko2020', 'aminim2009')
general_framework[['Multi-view representation learning']] <-    c('samya2023', 'sangy2022', 'jiax2021', 'liang2021', 'sus2021', 'yangp2014', 'zhang2021', 'maf2020', 'wangh2020', 'bhatt2019', 'chens2019', 'hoylea2019', 'wangh2019', 'ferreira2018', 'zhup2018', 'zhanz2017', 'perinaa2013', 'zhangb2013', 'zhangd2013', 'guyo2012', 'kovesim2012', 'yangp2012', 'zhengw2011', 'zhangb2008') 
general_framework[['Multi-view data fusion']] <-    c('fengz2024', 'jiz2024', 'xuy2024', 'varmanp2023', 'zhao2023', 'cgoncalves2022', 'liuj2022', 'luox2022', 'gui2021', 'huc2021', 'liuw2021', 'mmironczuk2020', 'mmironczuk2019', 'pengj2018', 'huz2017', 'sinorar2016', 'xuh2016', 'fakri2015', 'liuj2014', 'longg2013', 'lig2012', 'aminim2010', 'aminim2010b', 'suns2010', 'zhangx2010b', 'suns2008', 'matsubara2005', 'dasigiv2001')

summary_df <- create_bubble_df(year_multiview_df, general_framework)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(5))(5),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "General Framework",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "General Framework") %>% kableExtra::kable_styling()
Table 4.1: Table 4.2: General Framework
group YEAR n
5 Handling missing views 2009 1
25 Handling missing views 2020 1
33 Handling missing views 2024 1
1 Multi-view data fusion 2001 1
2 Multi-view data fusion 2005 1
3 Multi-view data fusion 2008 1
6 Multi-view data fusion 2009 1
8 Multi-view data fusion 2010 3
10 Multi-view data fusion 2012 1
12 Multi-view data fusion 2013 1
14 Multi-view data fusion 2014 1
16 Multi-view data fusion 2015 1
18 Multi-view data fusion 2016 2
19 Multi-view data fusion 2017 1
21 Multi-view data fusion 2018 1
23 Multi-view data fusion 2019 2
26 Multi-view data fusion 2020 1
28 Multi-view data fusion 2021 3
30 Multi-view data fusion 2022 4
34 Multi-view data fusion 2024 3
4 Multi-view representation learning 2008 1
9 Multi-view representation learning 2011 1
11 Multi-view representation learning 2012 3
13 Multi-view representation learning 2013 3
15 Multi-view representation learning 2014 1
20 Multi-view representation learning 2017 1
22 Multi-view representation learning 2018 2
24 Multi-view representation learning 2019 5
27 Multi-view representation learning 2020 1
29 Multi-view representation learning 2021 4
31 Multi-view representation learning 2022 1
32 Multi-view representation learning 2023 1
7 When to use multi-view learning 2009 1
17 When to use multi-view learning 2015 1
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "General Framework") %>% kableExtra::kable_styling()
Table 4.3: Table 4.4: General Framework
group count percentage label column
Handling missing views 3 5.3 3 (5.3%) group
Multi-view data fusion 28 49.1 28 (49.1%) group
Multi-view representation learning 24 42.1 24 (42.1%) group
When to use multi-view learning 2 3.5 2 (3.5%) group
# 1. Wrap the title at ~60 characters per line
# Treemap of the Distribution of Works on Multimodal–General Framework (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mv-gen-frem-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

4.1.2 Task

tasks <- list()
tasks[['Finance']] <- c('zhao2023')
tasks[['Social media analysis']] <- c('graffm2023', 'tianl2023', 'karisanip2022')
tasks[['Hate speech and offensive language detection']] <- c('graffm2023', 'lij2020')
tasks[['Fake news detection']] <- c('varmanp2023')
tasks[['Emotion recognition']] <- c('zhang2021')
tasks[['Sentiment analysis']] <- c('zhang2021', 'hoylea2019')
tasks[['Author profiling']] <- c('carmona2020')
tasks[['Technology forecasting']] <- c('gui2021')
tasks[['Multilingual text categorization']] <- c('sus2021', 'doinychko2020', 'maf2020', 'bhatt2019', 'zhanz2017', 'rajendran2016', 'fakri2015', 'guyo2012', 'kovesim2012', 'aminim2010', 'aminim2010b', 'aminim2009')
tasks[['Adversarial text classification']] <- c('lij2020')
tasks[['Lobbying disclosure analysis']] <- c('liaox2015')
tasks[['Software document classification']] <- c('liuj2014')
tasks[['Short text classification']] <- c('luox2022', 'longg2013')
tasks[['Web document classification']] <- c('mmironczuk2020', 'mmironczuk2019', 'suns2010', 'chenb2009', 'gup2009', 'zhangx2009', 'suns2008')
tasks[['News Article Classification']] <- c('dasigiv2001')
tasks[['Biomedical document, article analysis']] <- c('cgoncalves2022')
tasks[['Healthcare']] <- c('liuj2022', 'huz2017', 'xuh2016')
tasks[['Personality Prediction']] <- c('sangy2022')
tasks[['Extreme multi-label text classification']] <- c('chens2019')
tasks[['Other multi-view tasks']] <- c('akhtiamov2019', 'ferreira2018')

summary_df <- create_bubble_df(year_multiview_df, tasks)

base <- ggsci::pal_jco("default")(10)
extra <- c(lighten(base, 0.3), darken(base, 0.2))
palette_23 <- c(base, extra)[1:23]
global_palette <- stats::setNames(palette_23, unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Tasks",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Tasks") %>% kableExtra::kable_styling()
Table 4.5: Table 4.6: Tasks
group YEAR n
22 Adversarial text classification 2020 1
23 Author profiling 2020 1
31 Biomedical document, article analysis 2022 1
27 Emotion recognition 2021 1
17 Extreme multi-label text classification 2019 1
32 Fake news detection 2022 1
24 Finance 2020 1
25 Hate speech and offensive language detection 2020 1
37 Hate speech and offensive language detection 2023 1
12 Healthcare 2016 1
14 Healthcare 2017 1
33 Healthcare 2022 1
10 Lobbying disclosure analysis 2015 1
3 Multilingual text categorization 2009 2
5 Multilingual text categorization 2010 1
7 Multilingual text categorization 2012 2
11 Multilingual text categorization 2015 1
13 Multilingual text categorization 2016 1
15 Multilingual text categorization 2017 1
18 Multilingual text categorization 2019 1
26 Multilingual text categorization 2020 2
28 Multilingual text categorization 2021 1
1 News Article Classification 2001 1
16 Other multi-view tasks 2018 1
19 Other multi-view tasks 2019 1
34 Personality Prediction 2022 1
20 Sentiment analysis 2019 1
29 Sentiment analysis 2021 1
8 Short text classification 2013 1
35 Short text classification 2022 1
36 Social media analysis 2022 1
38 Social media analysis 2023 2
9 Software document classification 2014 1
30 Technology forecasting 2021 1
2 Web document classification 2008 1
4 Web document classification 2009 3
6 Web document classification 2010 1
21 Web document classification 2019 2
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Tasks") %>% kableExtra::kable_styling()
Table 4.7: Table 4.8: Tasks
group count percentage label column
Adversarial text classification 1 2.2 1 (2.2%) group
Author profiling 1 2.2 1 (2.2%) group
Biomedical document, article analysis 1 2.2 1 (2.2%) group
Emotion recognition 1 2.2 1 (2.2%) group
Extreme multi-label text classification 1 2.2 1 (2.2%) group
Fake news detection 1 2.2 1 (2.2%) group
Finance 1 2.2 1 (2.2%) group
Hate speech and offensive language detection 2 4.4 2 (4.4%) group
Healthcare 3 6.7 3 (6.7%) group
Lobbying disclosure analysis 1 2.2 1 (2.2%) group
Multilingual text categorization 12 26.7 12 (26.7%) group
News Article Classification 1 2.2 1 (2.2%) group
Other multi-view tasks 2 4.4 2 (4.4%) group
Personality Prediction 1 2.2 1 (2.2%) group
Sentiment analysis 2 4.4 2 (4.4%) group
Short text classification 2 4.4 2 (4.4%) group
Social media analysis 3 6.7 3 (6.7%) group
Software document classification 1 2.2 1 (2.2%) group
Technology forecasting 1 2.2 1 (2.2%) group
Web document classification 7 15.6 7 (15.6%) group
# 1. Wrap the title at ~60 characters per line
# Treemap of the Distribution of Works on Multimodal–Task (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mv-task-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

4.1.3 Type of Classification Task

classification_tasks <- list()
classification_tasks[['One-class Classification']] <- c('huc2021', 'chenb2009')
classification_tasks[['Binary Classification']] <- c('graffm2023', 'tianl2023', 'varmanp2023', 'cgoncalves2022', 'karisanip2022', 'liuj2022', 'sangy2022', 'carmona2020', 'lij2020', 'mmironczuk2020', 'akhtiamov2019', 'mmironczuk2019', 'pengj2018', 'huz2017', 'xux2016', 'brefeldu2015', 'liuj2014', 'yangp2014', 'longg2013', 'zhangb2013', 'zhangd2013', 'guyo2012', 'lig2012', 'yangp2012', 'aminim2010', 'aminim2010b', 'suns2010', 'aminim2009', 'suns2008', 'zhangb2008', 'matsubara2005')
classification_tasks[['Multi-class Classification']] <- c('jiz2024', 'xuy2024', 'zhangqi2024', 'samya2023', 'zhao2023', 'luox2022', 'gui2021', 'jiax2021', 'liang2021', 'sus2021', 'zhang2021', 'doinychko2020', 'maf2020', 'max2020', 'wangh2020', 'bhatt2019', 'hey2019', 'hoylea2019', 'wangh2019', 'ferreira2018', 'xuc2017', 'zhanz2017', 'iglesias2016', 'rajendran2016', 'sinorar2016', 'xuh2016', 'fakri2015', 'liy2013', 'perinaa2013', 'kovesim2012', 'zhengw2011', 'zhangx2010b', 'gup2009', 'zhangx2009', 'dasigiv2001')
classification_tasks[['Multi-label Classification']] <- c('fengz2024', 'liuw2021', 'zhang2021', 'chens2019', 'zhup2018', 'liaox2015')


summary_df <- create_bubble_df(year_multiview_df, classification_tasks)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(6))(6),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Type of Classification Task",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Type of Classification Task") %>% kableExtra::kable_styling()
Table 4.9: Table 4.10: Type of Classification Task
group YEAR n
2 Binary Classification 2005 1
3 Binary Classification 2008 2
4 Binary Classification 2009 2
7 Binary Classification 2010 2
10 Binary Classification 2012 3
12 Binary Classification 2013 3
14 Binary Classification 2014 2
15 Binary Classification 2015 1
18 Binary Classification 2016 1
20 Binary Classification 2017 1
22 Binary Classification 2018 1
25 Binary Classification 2019 3
28 Binary Classification 2020 2
33 Binary Classification 2022 5
35 Binary Classification 2023 2
1 Multi-class Classification 2001 1
5 Multi-class Classification 2009 2
8 Multi-class Classification 2010 1
9 Multi-class Classification 2011 1
11 Multi-class Classification 2012 1
13 Multi-class Classification 2013 2
16 Multi-class Classification 2015 1
19 Multi-class Classification 2016 4
21 Multi-class Classification 2017 2
23 Multi-class Classification 2018 1
26 Multi-class Classification 2019 5
29 Multi-class Classification 2020 4
30 Multi-class Classification 2021 5
34 Multi-class Classification 2022 1
36 Multi-class Classification 2023 1
37 Multi-class Classification 2024 3
17 Multi-label Classification 2015 1
24 Multi-label Classification 2018 1
27 Multi-label Classification 2019 1
31 Multi-label Classification 2021 2
38 Multi-label Classification 2024 1
6 One-class Classification 2009 1
32 One-class Classification 2021 1
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Type of Classification Task") %>% kableExtra::kable_styling()
Table 4.11: Table 4.12: Type of Classification Task
group count percentage label column
Binary Classification 31 41.9 31 (41.9%) group
Multi-class Classification 35 47.3 35 (47.3%) group
Multi-label Classification 6 8.1 6 (8.1%) group
One-class Classification 2 2.7 2 (2.7%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Type of Classification Task (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 70
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mv-class-task-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

4.1.4 Learning Paradigms

learning_paradigms <- list()
learning_paradigms[['Supervised Learning']] <- c('fengz2024', 'graffm2023', 'varmanp2023', 'zhao2023', 'cgoncalves2022', 'luox2022', 'gui2021', 'liang2021', 'liuw2021', 'sus2021', 'zhang2021', 'carmona2020', 'mmironczuk2020', 'akhtiamov2019', 'bhatt2019', 'chens2019', 'mmironczuk2019', 'ferreira2018', 'pengj2018', 'zhup2018', 'huz2017', 'xuc2017', 'zhanz2017', 'sinorar2016', 'xuh2016', 'liaox2015', 'liuj2014', 'liy2013', 'zhangd2013', 'kovesim2012', 'zhengw2011', 'aminim2010', 'zhangx2010b', 'aminim2009', 'dasigiv2001')
learning_paradigms[['Semi-supervised Learning']] <- c('jiz2024', 'zhangqi2024', 'huc2021', 'jiax2021', 'maf2020', 'iglesias2016', 'xux2016', 'brefeldu2015', 'fakri2015', 'longg2013', 'guyo2012', 'lig2012', 'aminim2010b', 'aminim2009', 'chenb2009', 'gup2009', 'suns2008', 'zhangb2008', 'matsubara2005')
learning_paradigms[['Active Learning']] <- c('jiz2024', 'karisanip2022', 'liuj2022', 'suns2010', 'gup2009', 'zhangx2009', 'suns2008')
learning_paradigms[['Transfer Learning']] <- c('fengz2024', 'bhatt2019', 'hey2019', 'rajendran2016', 'yangp2014', 'zhangb2013', 'yangp2012')
learning_paradigms[['Contrastive Learning']] <- c('samya2023')
learning_paradigms[['Graph Learning']] <- c('xuy2024')
learning_paradigms[['Meta Learning']] <- c('tianl2023')
learning_paradigms[['Few-shot Learning']] <- c('tianl2023')
learning_paradigms[['Adversarial Learning']] <- c('doinychko2020', 'lij2020')

summary_df <- create_bubble_df(year_multiview_df, learning_paradigms)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(9))(9),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Learning Paradigms",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Learning Paradigms") %>% kableExtra::kable_styling()
Table 4.13: Table 4.14: Learning Paradigms
group YEAR n
3 Active Learning 2008 1
5 Active Learning 2009 2
8 Active Learning 2010 1
34 Active Learning 2022 2
40 Active Learning 2024 1
29 Adversarial Learning 2020 2
36 Contrastive Learning 2023 1
37 Few-shot Learning 2023 1
41 Graph Learning 2024 1
38 Meta Learning 2023 1
2 Semi-supervised Learning 2005 1
4 Semi-supervised Learning 2008 2
6 Semi-supervised Learning 2009 3
9 Semi-supervised Learning 2010 1
12 Semi-supervised Learning 2012 2
15 Semi-supervised Learning 2013 1
20 Semi-supervised Learning 2015 2
22 Semi-supervised Learning 2016 2
30 Semi-supervised Learning 2020 1
32 Semi-supervised Learning 2021 2
42 Semi-supervised Learning 2024 2
1 Supervised Learning 2001 1
7 Supervised Learning 2009 2
10 Supervised Learning 2010 1
11 Supervised Learning 2011 1
13 Supervised Learning 2012 1
16 Supervised Learning 2013 2
18 Supervised Learning 2014 1
21 Supervised Learning 2015 1
23 Supervised Learning 2016 2
25 Supervised Learning 2017 3
26 Supervised Learning 2018 3
27 Supervised Learning 2019 5
31 Supervised Learning 2020 2
33 Supervised Learning 2021 5
35 Supervised Learning 2022 3
39 Supervised Learning 2023 1
43 Supervised Learning 2024 1
14 Transfer Learning 2012 1
17 Transfer Learning 2013 1
19 Transfer Learning 2014 1
24 Transfer Learning 2016 1
28 Transfer Learning 2019 2
44 Transfer Learning 2024 1
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Learning Paradigms") %>% kableExtra::kable_styling()
Table 4.15: Table 4.16: Learning Paradigms
group count percentage label column
Active Learning 7 9.5 7 (9.5%) group
Adversarial Learning 2 2.7 2 (2.7%) group
Contrastive Learning 1 1.4 1 (1.4%) group
Few-shot Learning 1 1.4 1 (1.4%) group
Graph Learning 1 1.4 1 (1.4%) group
Meta Learning 1 1.4 1 (1.4%) group
Semi-supervised Learning 19 25.7 19 (25.7%) group
Supervised Learning 35 47.3 35 (47.3%) group
Transfer Learning 7 9.5 7 (9.5%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Learning Paradigms (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mv-learn-para-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

4.1.5 Fusion Strategies

fusion_strategies <- list()
fusion_strategies[['Early Fusion']] <- c('jiz2024', 'xuy2024', 'zhangqi2024', 'samya2023', 'varmanp2023', 'luox2022', 'sangy2022', 'huc2021', 'jiax2021', 'liang2021', 'liuw2021', 'sus2021', 'zhang2021', 'carmona2020', 'lij2020', 'max2020', 'wangh2020', 'bhatt2019', 'wangh2019', 'ferreira2018', 'huz2017', 'xuc2017', 'zhanz2017', 'xuh2016', 'liy2013', 'zhangb2013', 'lig2012', 'zhengw2011', 'dasigiv2001')
fusion_strategies[['Late Fusion']] <- c('graffm2023', 'tianl2023', 'cgoncalves2022', 'gui2021',  'mmironczuk2020', 'akhtiamov2019', 'hoylea2019', 'mmironczuk2019', 'pengj2018', 'zhup2018', 'iglesias2016', 'sinorar2016', 'brefeldu2015', 'fakri2015', 'liaox2015', 'longg2013', 'kovesim2012', 'suns2010', 'zhangx2010b', 'zhangx2009', 'suns2008', 'zhangb2008', 'matsubara2005')
fusion_strategies[['Hybrid Fusion']] <- c('fengz2024','zhao2023', 'karisanip2022', 'liuj2022', 'doinychko2020', 'maf2020', 'chens2019', 'hey2019', 'rajendran2016', 'xux2016', 'liuj2014', 'zhangd2013', 'guyo2012', 'aminim2010', 'aminim2010b', 'chenb2009', 'gup2009')

summary_df <- create_bubble_df(year_multiview_df, fusion_strategies)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(6))(6),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Fusion Strategies",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Fusion Strategies") %>% kableExtra::kable_styling()
Table 4.17: Table 4.18: Fusion Strategies
group YEAR n
1 Early Fusion 2001 1
8 Early Fusion 2011 1
9 Early Fusion 2012 1
12 Early Fusion 2013 2
17 Early Fusion 2016 1
20 Early Fusion 2017 3
21 Early Fusion 2018 1
23 Early Fusion 2019 3
26 Early Fusion 2020 3
28 Early Fusion 2021 6
30 Early Fusion 2022 3
33 Early Fusion 2023 1
35 Early Fusion 2024 3
4 Hybrid Fusion 2009 3
6 Hybrid Fusion 2010 1
10 Hybrid Fusion 2012 1
13 Hybrid Fusion 2013 1
15 Hybrid Fusion 2014 1
18 Hybrid Fusion 2016 2
24 Hybrid Fusion 2019 2
27 Hybrid Fusion 2020 3
31 Hybrid Fusion 2022 2
36 Hybrid Fusion 2024 1
2 Late Fusion 2005 1
3 Late Fusion 2008 2
5 Late Fusion 2009 1
7 Late Fusion 2010 2
11 Late Fusion 2012 1
14 Late Fusion 2013 1
16 Late Fusion 2015 3
19 Late Fusion 2016 2
22 Late Fusion 2018 2
25 Late Fusion 2019 4
29 Late Fusion 2021 1
32 Late Fusion 2022 1
34 Late Fusion 2023 2
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Fusion Strategies") %>% kableExtra::kable_styling()
Table 4.19: Table 4.20: Fusion Strategies
group count percentage label column
Early Fusion 29 42.0 29 (42%) group
Hybrid Fusion 17 24.6 17 (24.6%) group
Late Fusion 23 33.3 23 (33.3%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Fusion Strategies (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 70
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mv-fusion-start-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

4.1.6 Fusion Techniques

fusion_techniques <- list()
fusion_techniques[['Machine Learning based Fusion']] <- c('graffm2023', 'zhao2023', 'cgoncalves2022', 'huc2021', 'sus2021', 'carmona2020', 'mmironczuk2020', 'wangh2020', 'akhtiamov2019', 'bhatt2019', 'hey2019', 'mmironczuk2019', 'wangh2019', 'pengj2018', 'zhup2018', 'zhanz2017', 'sinorar2016', 'xux2016', 'longg2013', 'zhangb2013', 'zhangd2013', 'lig2012', 'zhengw2011', 'suns2010', 'zhangx2009', 'suns2008')
fusion_techniques[['Probabilistic Fusion']] <- c('fengz2024', 'hoylea2019', 'iglesias2016', 'brefeldu2015', 'fakri2015', 'perinaa2013', 'aminim2010', 'aminim2010b', 'zhangx2010b', 'gup2009', 'zhangb2008')
fusion_techniques[['Neural Network Fusion']] <- c('jiz2024', 'zhangqi2024', 'tianl2023', 'varmanp2023', 'liuj2022', 'luox2022', 'gui2021', 'jiax2021', 'liuw2021', 'doinychko2020', 'lij2020', 'max2020', 'chens2019', 'huz2017', 'xuc2017', 'rajendran2016', 'xuh2016', 'dasigiv2001')
fusion_techniques[['Attention Fusion']] <- c('sangy2022', 'liang2021', 'zhang2021')
fusion_techniques[['Graph Fusion']] <- c('fengz2024', 'jiz2024', 'xuy2024', 'samya2023', 'lig2012')

summary_df <- create_bubble_df(year_multiview_df, fusion_techniques)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(6))(6),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Fusion Techniques",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Fusion Techniques") %>% kableExtra::kable_styling()
Table 4.21: Table 4.22: Fusion Techniques
group YEAR n
25 Attention Fusion 2021 2
28 Attention Fusion 2022 1
9 Graph Fusion 2012 1
31 Graph Fusion 2023 1
34 Graph Fusion 2024 3
2 Machine Learning based Fusion 2008 1
4 Machine Learning based Fusion 2009 1
6 Machine Learning based Fusion 2010 1
8 Machine Learning based Fusion 2011 1
10 Machine Learning based Fusion 2012 1
11 Machine Learning based Fusion 2013 3
14 Machine Learning based Fusion 2016 2
17 Machine Learning based Fusion 2017 1
19 Machine Learning based Fusion 2018 2
20 Machine Learning based Fusion 2019 7
23 Machine Learning based Fusion 2020 2
26 Machine Learning based Fusion 2021 2
29 Machine Learning based Fusion 2022 1
32 Machine Learning based Fusion 2023 1
1 Neural Network Fusion 2001 1
15 Neural Network Fusion 2016 2
18 Neural Network Fusion 2017 2
21 Neural Network Fusion 2019 1
24 Neural Network Fusion 2020 3
27 Neural Network Fusion 2021 3
30 Neural Network Fusion 2022 3
33 Neural Network Fusion 2023 1
35 Neural Network Fusion 2024 2
3 Probabilistic Fusion 2008 1
5 Probabilistic Fusion 2009 2
7 Probabilistic Fusion 2010 2
12 Probabilistic Fusion 2013 1
13 Probabilistic Fusion 2015 2
16 Probabilistic Fusion 2016 1
22 Probabilistic Fusion 2019 1
36 Probabilistic Fusion 2024 1
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Fusion Techniques") %>% kableExtra::kable_styling()
Table 4.23: Table 4.24: Fusion Techniques
group count percentage label column
Attention Fusion 3 4.8 3 (4.8%) group
Graph Fusion 5 7.9 5 (7.9%) group
Machine Learning based Fusion 26 41.3 26 (41.3%) group
Neural Network Fusion 18 28.6 18 (28.6%) group
Probabilistic Fusion 11 17.5 11 (17.5%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Fusion Techniques (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 70
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mv-fusion-tech-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

4.1.7 Datasets

datasets <- list()
datasets[['Public Benchmark Datasets']] <- c('fengz2024', 'jiz2024', 'xuy2024', 'zhangqi2024', 'samya2023', 'varmanp2023', 'luox2022', 'huc2021', 'jiax2021', 'liang2021', 'liuw2021', 'zhang2021', 'doinychko2020', 'max2020', 'wangh2020', 'bhatt2019', 'chens2019', 'hey2019', 'hoylea2019', 'wangh2019', 'ferreira2018', 'pengj2018', 'zhup2018', 'xuc2017', 'iglesias2016', 'rajendran2016', 'sinorar2016', 'xuh2016', 'xux2016', 'brefeldu2015', 'fakri2015', 'liuj2014', 'yangp2014', 'liy2013', 'longg2013', 'perinaa2013', 'zhangb2013', 'zhangd2013', 'yangp2012', 'zhengw2011', 'aminim2010', 'aminim2010b', 'aminim2009', 'chenb2009', 'gup2009', 'zhangx2009', 'zhangb2008', 'matsubara2005', 'dasigiv2001')
datasets[['Domain-specific Datasets']] <- c('graffm2023', 'zhao2023', 'cgoncalves2022', 'liuj2022', 'sangy2022', 'gui2021', 'carmona2020', 'lij2020', 'mmironczuk2020', 'akhtiamov2019', 'mmironczuk2019', 'huz2017', 'liaox2015', 'suns2010', 'suns2008')
datasets[['Multilingual Datasets']] <- c('tianl2023', 'sus2021', 'carmona2020', 'maf2020', 'maf2020', 'bhatt2019', 'zhanz2017', 'guyo2012', 'kovesim2012', 'aminim2010', 'aminim2010b', 'aminim2009')

summary_df <- create_bubble_df(year_multiview_df, datasets)

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(6))(6),
                                  unique(summary_df$group))

bubble_plot <- create_bubble_plot(summary_df, 
                  title = "Datasets",
                  subtitle = "Distribution of papers by category and publication year",
                  global_palette = global_palette)
bubble_plot

knitr::kable(summary_df, caption = "Datasets") %>% kableExtra::kable_styling()
Table 4.25: Table 4.26: Datasets
group YEAR n
3 Domain-specific Datasets 2008 1
7 Domain-specific Datasets 2010 1
15 Domain-specific Datasets 2015 1
18 Domain-specific Datasets 2017 1
22 Domain-specific Datasets 2019 3
25 Domain-specific Datasets 2020 3
28 Domain-specific Datasets 2021 1
31 Domain-specific Datasets 2022 3
33 Domain-specific Datasets 2023 1
5 Multilingual Datasets 2009 2
8 Multilingual Datasets 2010 1
11 Multilingual Datasets 2012 2
19 Multilingual Datasets 2017 1
23 Multilingual Datasets 2019 1
26 Multilingual Datasets 2020 3
29 Multilingual Datasets 2021 1
34 Multilingual Datasets 2023 1
1 Public Benchmark Datasets 2001 1
2 Public Benchmark Datasets 2005 1
4 Public Benchmark Datasets 2008 1
6 Public Benchmark Datasets 2009 5
9 Public Benchmark Datasets 2010 1
10 Public Benchmark Datasets 2011 1
12 Public Benchmark Datasets 2012 1
13 Public Benchmark Datasets 2013 5
14 Public Benchmark Datasets 2014 2
16 Public Benchmark Datasets 2015 2
17 Public Benchmark Datasets 2016 5
20 Public Benchmark Datasets 2017 1
21 Public Benchmark Datasets 2018 3
24 Public Benchmark Datasets 2019 6
27 Public Benchmark Datasets 2020 2
30 Public Benchmark Datasets 2021 5
32 Public Benchmark Datasets 2022 2
35 Public Benchmark Datasets 2023 1
36 Public Benchmark Datasets 2024 4
unpacked_df <- summary_df[rep(row.names(summary_df), summary_df$n), c("group", "YEAR")]

unpacked_summary_df <- create_summary_df(unpacked_df, "group")

ggplot_chart <- create_treemap_chart(
  summary_df = unpacked_summary_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)
ggplot_chart

knitr::kable(unpacked_summary_df, caption = "Datasets") %>% kableExtra::kable_styling()
Table 4.27: Table 4.28: Datasets
group count percentage label column
Domain-specific Datasets 15 19.7 15 (19.7%) group
Multilingual Datasets 12 15.8 12 (15.8%) group
Public Benchmark Datasets 49 64.5 49 (64.5%) group
# 1. Wrap the title
# Treemap of the Distribution of Works on Datasets (Left), and Annual Distribution of These Works (Right)
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title, gp = grid::gpar(fontsize = 16, fontface = "bold"), hjust = 0.5)

# 2. Extract the FILL legend (from treemap)
treemap_for_fill_legend <- ggplot_chart +
  guides(
    fill = guide_legend(
      ncol = 8, 
      byrow = TRUE,
      keywidth = unit(0.6, "cm"),
      keyheight = unit(0.6, "cm"),
      title = "Group",
      title.position = "top", 
      title.hjust = 0.5 
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal", 
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 8), 
    legend.spacing.x = unit(0.15, "cm"),
    legend.spacing.y = unit(0.1, "cm"),
    legend.background = element_rect(fill="transparent", colour = NA), # Ensure no box around fill legend either
    legend.box.background = element_rect(fill="transparent", colour = NA),
    legend.box.margin = margin(t = 5, r = 5, b = 5, l = 0) # Margin for fill legend
  )

g_fill <- ggplotGrob(treemap_for_fill_legend)
fill_legend_grob <- g_fill$grobs[[which(sapply(g_fill$grobs, function(x) x$name) == "guide-box")]]

# 3. Extract the SIZE legend (from bubble plot, horizontal, no box)
bubble_plot_for_size_legend <- bubble_plot +
  guides(
    fill = "none", 
    size = guide_legend(
      title = "Count",
      direction = "horizontal",
      title.position = "top",    
      label.position = "bottom", 
      keywidth = unit(1.2, "cm"), 
      keyheight = unit(0.8, "cm"),
      label.hjust = 0.5,         
      title.hjust = 0.5,         
      override.aes = list(fill = "grey70", color = NA, alpha = 0.8, shape = 21) # color = NA for no border on keys
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "horizontal",
    legend.title = element_text(size = 10, face = "plain"),
    legend.text = element_text(size = 9),
    legend.background = element_rect(fill="transparent", colour = NA), # No fill, no border for overall legend
    legend.box.background = element_rect(fill="transparent", colour = NA), # No fill, no border for the guide box
    legend.key = element_rect(fill="transparent", colour = NA), # No fill/border for individual key backgrounds
    legend.spacing.x = unit(0.1, "cm"), # Tighter spacing for horizontal legend
    legend.box.margin = margin(t = 5, r = 0, b = 5, l = 5) # Margin for size legend
  )

g_size <- ggplotGrob(bubble_plot_for_size_legend)
size_legend_grob <- g_size$grobs[[which(sapply(g_size$grobs, function(x) x$name) == "guide-box")]]

# Check if legends were found
if(is.null(fill_legend_grob)) stop("Fill legend not found!")
if(is.null(size_legend_grob)) stop("Size legend not found!")

# 4. Combine the two extracted legends HORIZONTALLY
#    Give fill legend more space.
combined_legends_grob <- arrangeGrob(
  fill_legend_grob,
  size_legend_grob,
  ncol = 2,
  widths = unit(c(0.65, 0.35), "npc") # Fill legend gets 65%, Size legend gets 35% of the horizontal space for legends
                                      # Adjust these proportions as needed.
)

# 5. Remove legends from the original plots for the main panel
ggplot_chart_no_legend <- ggplot_chart + theme(legend.position = "none")
bubble_plot_no_legend <- bubble_plot + 
 theme(legend.position = "none",
       plot.title = element_blank(), 
       plot.subtitle = element_blank())

# 6. Arrange the plots in a 2-column layout
plots_row <- arrangeGrob(
  ggplot_chart_no_legend,
  bubble_plot_no_legend,
  ncol = 2
)

# 7. Create the combined plot 
#    The height of the legend panel will be determined by the taller of the two (multi-row fill legend)
title_height_est <- grobHeight(title_grob) + unit(0.2, "inches") 

# Calculate height of the combined horizontal legend panel
# It will be the height of the fill legend (which is multi-row)
legend_panel_actual_height <- grobHeight(fill_legend_grob) 
final_legend_panel_height <- legend_panel_actual_height + unit(0.3, "inches") # Add padding

combined_plot_final <- grid.arrange(
  title_grob,
  plots_row,
  combined_legends_grob,
  ncol = 1,
  heights = unit.c(title_height_est,
                   unit(1, "null"), 
                   final_legend_panel_height) 
)

ggsave(
  "fig-mv-datasets-pie-bubble.pdf",
  plot = combined_plot_final,
  width = 14,
  height = 8
)

4.2 Datasets, Models, and Performance metrics selected for comparison

GG_PLOTS_REGISTER <- list()

# Rename the columns to make them easier to work with
df_renamed <- multiview_df %>%
  rename(
    datasets = 'Identified the datasets used in the article',
    dis_datasets = 'Disambiguated datasets names',
    models = 'What other models were selected for comparison?',
    dis_models = 'Disambiguated models names',
    metrics = 'Identified performance metrics used in the article',
    dis_metrics = 'Disambiguated performance metrics names'
  )

# Create better titles for each column
title_mapping <- c(
  "dis_datasets" = "Disambiguated datasets names",
  "dis_models" = "Disambiguated models names",
  "dis_metrics" = "Disambiguated performance metrics names"
)

4.2.1 Datasets

# Clean and standardize responses - convert everything to lower case and handle variations
dis_df <- df_renamed %>%
  select('l.p', 'dis_datasets') %>%
  separate_rows('dis_datasets', sep = "\\r?\\n") %>%
  mutate(
    dis_datasets = stringr::str_to_lower(dis_datasets),
    dis_datasets = stringr::str_trim(dis_datasets),
    dis_datasets = na_if(dis_datasets, "")
  )

summary_df <- create_summary_df(dis_df, 'dis_datasets') %>%
  arrange(desc(count)) %>% filter(!is.na(group))

summary_cuted_df <- summary_df %>%
  # 1) lump any low‑frequency group into "other"
  mutate(group = if_else(count < 5, "other", group)) %>%
  # 2) re‑aggregate by the (possibly new) group
  group_by(group, column) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  # 3) recompute percentages and labels over the new totals
  mutate(percentage = count / sum(count) * 100,
         label      = paste0(count, " \n (", round(percentage, 1), "%)")) %>%
  # 4) order by descending count
  arrange(desc(count))

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(7))(7),
                                  unique(summary_cuted_df$group))

ggplot_chart <- create_treemap_chart(
  summary_df = summary_cuted_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)

ggsave(
  "fig-mv-eval-datasets-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['dis_datasets']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['dis_datasets']) %>% kableExtra::kable_styling()
Table 4.29: Table 4.30: Disambiguated datasets names
group count percentage label column
reuters corpora 29 11.9 29 (11.9%) dis_datasets
20 newsgroups 12 4.9 12 (4.9%) dis_datasets
webkb 8 3.3 8 (3.3%) dis_datasets
imdb movie reviews 6 2.5 6 (2.5%) dis_datasets
yelp reviews 5 2.1 5 (2.1%) dis_datasets
ag news 4 1.6 4 (1.6%) dis_datasets
amazon reviews 4 1.6 4 (1.6%) dis_datasets
cora 4 1.6 4 (1.6%) dis_datasets
mr 4 1.6 4 (1.6%) dis_datasets
twitter data 4 1.6 4 (1.6%) dis_datasets
yale faces 4 1.6 4 (1.6%) dis_datasets
bbc news 3 1.2 3 (1.2%) dis_datasets
corel image collections 3 1.2 3 (1.2%) dis_datasets
nus-wide dataset family 3 1.2 3 (1.2%) dis_datasets
ohsumed corpus 3 1.2 3 (1.2%) dis_datasets
3sources 2 0.8 2 (0.8%) dis_datasets
github 2 0.8 2 (0.8%) dis_datasets
holidays image dataset 2 0.8 2 (0.8%) dis_datasets
inventorum 2 0.8 2 (0.8%) dis_datasets
mnist 2 0.8 2 (0.8%) dis_datasets
sogou news/text 2 0.8 2 (0.8%) dis_datasets
thucnews 2 0.8 2 (0.8%) dis_datasets
trec question/text classification 2 0.8 2 (0.8%) dis_datasets
web of science 2 0.8 2 (0.8%) dis_datasets
wikipedia data 2 0.8 2 (0.8%) dis_datasets
aapd (arxiv academic paper dataset) 1 0.4 1 (0.4%) dis_datasets
abusive ugc (abuse) 1 0.4 1 (0.4%) dis_datasets
acl 1 0.4 1 (0.4%) dis_datasets
ads data set 1 0.4 1 (0.4%) dis_datasets
adverse drug reactions (adr) dataset 1 0.4 1 (0.4%) dis_datasets
aep 1 0.4 1 (0.4%) dis_datasets
air 1 0.4 1 (0.4%) dis_datasets
aloi 1 0.4 1 (0.4%) dis_datasets
animal 1 0.4 1 (0.4%) dis_datasets
arem 1 0.4 1 (0.4%) dis_datasets
australian 1 0.4 1 (0.4%) dis_datasets
bioid 1 0.4 1 (0.4%) dis_datasets
biometrics data 1 0.4 1 (0.4%) dis_datasets
breast cancer 1 0.4 1 (0.4%) dis_datasets
bs-top4 1 0.4 1 (0.4%) dis_datasets
caltech101-20 1 0.4 1 (0.4%) dis_datasets
citeseer data 1 0.4 1 (0.4%) dis_datasets
clinical interview fragments (motivational interviewing-based weight loss sessions) 1 0.4 1 (0.4%) dis_datasets
collab 1 0.4 1 (0.4%) dis_datasets
comparable multilingual corpus 1 0.4 1 (0.4%) dis_datasets
cub 1 0.4 1 (0.4%) dis_datasets
customer review (cr) 1 0.4 1 (0.4%) dis_datasets
da-vincis challenge data 1 0.4 1 (0.4%) dis_datasets
dd 1 0.4 1 (0.4%) dis_datasets
deezer 1 0.4 1 (0.4%) dis_datasets
digits data 1 0.4 1 (0.4%) dis_datasets
douban movie short comments (dmsc) 1 0.4 1 (0.4%) dis_datasets
east money 1 0.4 1 (0.4%) dis_datasets
elec 1 0.4 1 (0.4%) dis_datasets
emotion classification (ec) 1 0.4 1 (0.4%) dis_datasets
emotion text (twt-13) 1 0.4 1 (0.4%) dis_datasets
enzymes 1 0.4 1 (0.4%) dis_datasets
espgame 1 0.4 1 (0.4%) dis_datasets
eurlex-4k 1 0.4 1 (0.4%) dis_datasets
europarl v7 1 0.4 1 (0.4%) dis_datasets
facebook 1 0.4 1 (0.4%) dis_datasets
fakenews 1 0.4 1 (0.4%) dis_datasets
fudan 1 0.4 1 (0.4%) dis_datasets
gas 1 0.4 1 (0.4%) dis_datasets
haodf online dataset 1 0.4 1 (0.4%) dis_datasets
hate speech (hate-3) 1 0.4 1 (0.4%) dis_datasets
health-related forum dataset (approximately 2 million sentences)<U+200B> 1 0.4 1 (0.4%) dis_datasets
iaprtc12 1 0.4 1 (0.4%) dis_datasets
iclr 1 0.4 1 (0.4%) dis_datasets
iflytek 1 0.4 1 (0.4%) dis_datasets
image recognition dataset 1 0.4 1 (0.4%) dis_datasets
inews 1 0.4 1 (0.4%) dis_datasets
insider threat detection (itd) 1 0.4 1 (0.4%) dis_datasets
internet advertisements (ad) 1 0.4 1 (0.4%) dis_datasets
internet movie script database (imsdb) 1 0.4 1 (0.4%) dis_datasets
ionosphere 1 0.4 1 (0.4%) dis_datasets
kaggle 1 0.4 1 (0.4%) dis_datasets
lnai dataset (lecture notes in artificial intelligence dataset). 1 0.4 1 (0.4%) dis_datasets
lobbying disclosure act (lda) dataset 1 0.4 1 (0.4%) dis_datasets
lpa 1 0.4 1 (0.4%) dis_datasets
mag-cs 1 0.4 1 (0.4%) dis_datasets
mastercook recipes dataset 1 0.4 1 (0.4%) dis_datasets
mcintire 1 0.4 1 (0.4%) dis_datasets
medline citations 1 0.4 1 (0.4%) dis_datasets
mexican twitter corpus (mex-a3t-500) 1 0.4 1 (0.4%) dis_datasets
mirflickr 1 0.4 1 (0.4%) dis_datasets
movie review (mr) 1 0.4 1 (0.4%) dis_datasets
mscoco dataset 1 0.4 1 (0.4%) dis_datasets
multidom 1 0.4 1 (0.4%) dis_datasets
multilingual ted corpus 1 0.4 1 (0.4%) dis_datasets
multilingual text (mtext) data 1 0.4 1 (0.4%) dis_datasets
multimedia web image-text database 1 0.4 1 (0.4%) dis_datasets
narrativeqa 1 0.4 1 (0.4%) dis_datasets
nasdaq 1 0.4 1 (0.4%) dis_datasets
nci1 1 0.4 1 (0.4%) dis_datasets
newswire articles written in 5 languages (english, french, german, italian, spanish) 1 0.4 1 (0.4%) dis_datasets
nytimes (nyt) 1 0.4 1 (0.4%) dis_datasets
object detection dataset (pascal voc 2007) 1 0.4 1 (0.4%) dis_datasets
observation dataset 1 0.4 1 (0.4%) dis_datasets
office-caltech 1 0.4 1 (0.4%) dis_datasets
orl 1 0.4 1 (0.4%) dis_datasets
out-scene 1 0.4 1 (0.4%) dis_datasets
page-blocks 1 0.4 1 (0.4%) dis_datasets
pascal07 1 0.4 1 (0.4%) dis_datasets
people 1 0.4 1 (0.4%) dis_datasets
person re-identification dataset 1 0.4 1 (0.4%) dis_datasets
pima 1 0.4 1 (0.4%) dis_datasets
pm 1 0.4 1 (0.4%) dis_datasets
pornographic ugc (porn) 1 0.4 1 (0.4%) dis_datasets
product consumption dataset 1 0.4 1 (0.4%) dis_datasets
product review data set 1 0.4 1 (0.4%) dis_datasets
proteins 1 0.4 1 (0.4%) dis_datasets
pubmed 1 0.4 1 (0.4%) dis_datasets
rec 1 0.4 1 (0.4%) dis_datasets
reddit data 1 0.4 1 (0.4%) dis_datasets
rumour detection dataset 1 0.4 1 (0.4%) dis_datasets
same-3 dataset 1 0.4 1 (0.4%) dis_datasets
scy-cluster 1 0.4 1 (0.4%) dis_datasets
scy-genes 1 0.4 1 (0.4%) dis_datasets
se-absa15 1 0.4 1 (0.4%) dis_datasets
search snippets dataset 1 0.4 1 (0.4%) dis_datasets
semeval 1 0.4 1 (0.4%) dis_datasets
smart video corpus (svc) 1 0.4 1 (0.4%) dis_datasets
snippets 1 0.4 1 (0.4%) dis_datasets
spambase 1 0.4 1 (0.4%) dis_datasets
stanford sentiment treebank (sst) 1 0.4 1 (0.4%) dis_datasets
story2personality 1 0.4 1 (0.4%) dis_datasets
subjectivity dataset (subj) 1 0.4 1 (0.4%) dis_datasets
synthetic data set 1 0.4 1 (0.4%) dis_datasets
synthetic datasets 1 0.4 1 (0.4%) dis_datasets
tagmynews 1 0.4 1 (0.4%) dis_datasets
the article does not explicitly mention the names of any specific datasets used. 1 0.4 1 (0.4%) dis_datasets
the personality database (pdb) 1 0.4 1 (0.4%) dis_datasets
tiger corpus 1 0.4 1 (0.4%) dis_datasets
topic 1 0.4 1 (0.4%) dis_datasets
toutiao 1 0.4 1 (0.4%) dis_datasets
tweet 1 0.4 1 (0.4%) dis_datasets
uid 1 0.4 1 (0.4%) dis_datasets
umist 1 0.4 1 (0.4%) dis_datasets
w-1000 1 0.4 1 (0.4%) dis_datasets
w-200 1 0.4 1 (0.4%) dis_datasets
w-2101 1 0.4 1 (0.4%) dis_datasets
waveform 1 0.4 1 (0.4%) dis_datasets
web advertisement images dataset 1 0.4 1 (0.4%) dis_datasets
web of science (wos) 1 0.4 1 (0.4%) dis_datasets
welfake 1 0.4 1 (0.4%) dis_datasets
wiki-30k 1 0.4 1 (0.4%) dis_datasets
wikics 1 0.4 1 (0.4%) dis_datasets
wikiner corpus 1 0.4 1 (0.4%) dis_datasets
x-ray microbeam speech data (xrmb) 1 0.4 1 (0.4%) dis_datasets
yeast data 1 0.4 1 (0.4%) dis_datasets

4.2.2 Models selected for comparison

# Clean and standardize responses - convert everything to lower case and handle variations
dis_df <- df_renamed %>%
  select('l.p', 'dis_models') %>%
  separate_rows('dis_models', sep = "\\r?\\n") %>%
  mutate(
    dis_models = stringr::str_to_lower(dis_models),
    dis_models = stringr::str_trim(dis_models),
    dis_models = na_if(dis_models, "")
  )

summary_df <- create_summary_df(dis_df, 'dis_models') %>%
  arrange(desc(count)) %>% filter(!is.na(group))

summary_cuted_df <- summary_df %>%
  # 1) lump any low‑frequency group into "other"
  mutate(group = if_else(count < 20, "other", group)) %>%
  # 2) re‑aggregate by the (possibly new) group
  group_by(group, column) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  # 3) recompute percentages and labels over the new totals
  mutate(percentage = count / sum(count) * 100,
         label      = paste0(count, " (", round(percentage, 1), "%)")) %>%
  # 4) order by descending count
  arrange(desc(count))

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(7))(7),
                                  unique(summary_cuted_df$group))

ggplot_chart <- create_treemap_chart(
  summary_df = summary_cuted_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)

ggsave(
  "fig-mv-eval-models-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['dis_models']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['dis_models']) %>% kableExtra::kable_styling()
Table 4.31: Table 4.32: Disambiguated models names
group count percentage label column
multi-modal/multi-view specific architectures & techniques 85 23.2 85 (23.2%) dis_models
support vector machine (svm) 50 13.6 50 (13.6%) dis_models
traditional/statistical machine learning & other methods 35 9.5 35 (9.5%) dis_models
graph neural networks (gnn) 24 6.5 24 (6.5%) dis_models
word embeddings and text representation 18 4.9 18 (4.9%) dis_models
bert (bidirectional encoder representations from transformers) 15 4.1 15 (4.1%) dis_models
convolutional neural networks (cnn) 14 3.8 14 (3.8%) dis_models
boosting methods 13 3.5 13 (3.5%) dis_models
long short-term memory (lstm) 13 3.5 13 (3.5%) dis_models
naive bayes 13 3.5 13 (3.5%) dis_models
correlation analysis (cca) family 10 2.7 10 (2.7%) dis_models
k-nearest neighbors (knn) 9 2.5 9 (2.5%) dis_models
self-attention / attention 8 2.2 8 (2.2%) dis_models
decision tree 5 1.4 5 (1.4%) dis_models
logistic regression 5 1.4 5 (1.4%) dis_models
gated recurrent unit (gru) 4 1.1 4 (1.1%) dis_models
graph attention network (gat) 4 1.1 4 (1.1%) dis_models
graph convolutional network (gcn) 4 1.1 4 (1.1%) dis_models
neural network 4 1.1 4 (1.1%) dis_models
recurrent neural network (rnn) 4 1.1 4 (1.1%) dis_models
autoencoders 2 0.5 2 (0.5%) dis_models
capsule network 2 0.5 2 (0.5%) dis_models
random forest 2 0.5 2 (0.5%) dis_models
traditional/statistical machine learning & other methods<U+200B> 2 0.5 2 (0.5%) dis_models
transformer-based architectures 2 0.5 2 (0.5%) dis_models
adbert 1 0.3 1 (0.3%) dis_models
alexnet 1 0.3 1 (0.3%) dis_models
arima 1 0.3 1 (0.3%) dis_models
bayesian network (bayes) 1 0.3 1 (0.3%) dis_models
fasttext 1 0.3 1 (0.3%) dis_models
latent dirichlet allocation (lda) 1 0.3 1 (0.3%) dis_models
latent semantic analysis (lsa) 1 0.3 1 (0.3%) dis_models
logistic regression (lg) 1 0.3 1 (0.3%) dis_models
long short term memory (lstm) 1 0.3 1 (0.3%) dis_models
roberta (robustly optimized bert pretraining approach) 1 0.3 1 (0.3%) dis_models
support vector machines (svm) 1 0.3 1 (0.3%) dis_models
textrcnn 1 0.3 1 (0.3%) dis_models
xgboost 1 0.3 1 (0.3%) dis_models

4.2.3 Performance metrics

# Clean and standardize responses - convert everything to lower case and handle variations
dis_df <- df_renamed %>%
  select('l.p', 'dis_metrics') %>%
  separate_rows('dis_metrics', sep = "\\r?\\n") %>%
  mutate(
    dis_metrics = stringr::str_to_lower(dis_metrics),
    dis_metrics = stringr::str_trim(dis_metrics),
    dis_metrics = na_if(dis_metrics, "")
  )

summary_df <- create_summary_df(dis_df, 'dis_metrics') %>%
  arrange(desc(count)) %>% filter(!is.na(group))

summary_cuted_df <- summary_df %>%
  # 1) lump any low‑frequency group into "other"
  mutate(group = if_else(count < 11, "other", group)) %>%
  # 2) re‑aggregate by the (possibly new) group
  group_by(group, column) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  # 3) recompute percentages and labels over the new totals
  mutate(percentage = count / sum(count) * 100,
         label      = paste0(count, " (", round(percentage, 1), "%)")) %>%
  # 4) order by descending count
  arrange(desc(count))

global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(7))(7),
                                  unique(summary_cuted_df$group))

ggplot_chart <- create_treemap_chart(
  summary_df = summary_cuted_df,
  title = "",
  subtitle = "",
  global_palette = global_palette,
  text_size = 11  # Adjust as needed
)

ggsave(
  "fig-mv-eval-per-met-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['dis_metrics']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['dis_metrics']) %>% kableExtra::kable_styling()
Table 4.33: Table 4.34: Disambiguated performance metrics names
group count percentage label column
accuracy 49 22.2 49 (22.2%) dis_metrics
f1-score 34 15.4 34 (15.4%) dis_metrics
precision 20 9.0 20 (9%) dis_metrics
recall 17 7.7 17 (7.7%) dis_metrics
area under the roc curve (auc/auroc) 9 4.1 9 (4.1%) dis_metrics
classification error rate 7 3.2 7 (3.2%) dis_metrics
other specific metrics 7 3.2 7 (3.2%) dis_metrics
hierarchical classification metrics 6 2.7 6 (2.7%) dis_metrics
evaluation aspects/concepts 5 2.3 5 (2.3%) dis_metrics
macro f1-score 5 2.3 5 (2.3%) dis_metrics
mean average precision (map) 5 2.3 5 (2.3%) dis_metrics
statistical tests 5 2.3 5 (2.3%) dis_metrics
micro f1-score 4 1.8 4 (1.8%) dis_metrics
variability and confidence metrics 4 1.8 4 (1.8%) dis_metrics
computational and resource metrics 3 1.4 3 (1.4%) dis_metrics
correlation coefficients 3 1.4 3 (1.4%) dis_metrics
distance and similarity metrics 3 1.4 3 (1.4%) dis_metrics
average precision (ap) 2 0.9 2 (0.9%) dis_metrics
hamming loss 2 0.9 2 (0.9%) dis_metrics
kappa statistics 2 0.9 2 (0.9%) dis_metrics
confusion matrix 1 0.5 1 (0.5%) dis_metrics
convergence rate 1 0.5 1 (0.5%) dis_metrics
cross-entropy loss 1 0.5 1 (0.5%) dis_metrics
error 1 0.5 1 (0.5%) dis_metrics
error rate 1 0.5 1 (0.5%) dis_metrics
generalization capability 1 0.5 1 (0.5%) dis_metrics
log-likelihood 1 0.5 1 (0.5%) dis_metrics
loss (general) 1 0.5 1 (0.5%) dis_metrics
macro-averaging 1 0.5 1 (0.5%) dis_metrics
macro-f1 score 1 0.5 1 (0.5%) dis_metrics
mean absolute percentage error (mape) 1 0.5 1 (0.5%) dis_metrics
mean error 1 0.5 1 (0.5%) dis_metrics
mean reciprocal rank (mrr) 1 0.5 1 (0.5%) dis_metrics
mean square error (mse) 1 0.5 1 (0.5%) dis_metrics
micro-averaging 1 0.5 1 (0.5%) dis_metrics
nlp-specific metrics 1 0.5 1 (0.5%) dis_metrics
normalized discounted cumulative gain (ndcg) 1 0.5 1 (0.5%) dis_metrics
pixel error 1 0.5 1 (0.5%) dis_metrics
precision at k () 1 0.5 1 (0.5%) dis_metrics
recall at k () 1 0.5 1 (0.5%) dis_metrics
recall rate 1 0.5 1 (0.5%) dis_metrics
recall<U+200B> 1 0.5 1 (0.5%) dis_metrics
root mean square error (rmse) 1 0.5 1 (0.5%) dis_metrics
stability 1 0.5 1 (0.5%) dis_metrics
test performance 1 0.5 1 (0.5%) dis_metrics
unweighted average recall (uar) 1 0.5 1 (0.5%) dis_metrics
word error rate (wer) 1 0.5 1 (0.5%) dis_metrics

4.2.4 All plots

# Treemap charts of the distribution of datasets, models and performance metrixes used in multiview works
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title,
                             gp = grid::gpar(fontsize = 14, fontface = "plain"),
                             just = "center")

layout <- rbind(
  c(1, 1),
  c(2, 3),
  c(4, 4)
)

combined_plot <- gridExtra::grid.arrange(
  title_grob,
  GG_PLOTS_REGISTER[['dis_datasets']], GG_PLOTS_REGISTER[['dis_metrics']], GG_PLOTS_REGISTER[['dis_models']],
  layout_matrix = layout,
  heights       = c(1, 10, 10)  # tweak these ratios to taste
)

ggsave(
  "fig-mv-datasets-models-met-pie.pdf",
  plot = combined_plot,
  width = 14,
  height = 8
)
plot(combined_plot)

4.3 Validation procedure

GG_PLOTS_REGISTER <- list()

# Rename the columns to make them easier to work with
df_renamed <- multiview_df %>%
  rename(
    test_set_used = 'Was a test set or cross-validation procedure used to determine the performance metrics values? Please write only yes, or no.',
    statistical_tests = 'Were statistical tests used? Please write only yes, or no.',
    other_analysis_methods = 'Did other methods of analyzing the outcomes, for example Bayesian, explainable machine learning methods, be used? Please write yes, or no.',
    ablation_studies = 'Were ablation studies and/or comparisons with unimodal classifiers performed? Please write yes, or no.',
    replication_info = 'Did the authors provide enough information (data, code) to allow for replication of the study? Please write only yes, or no.'
  )

# Clean and standardize responses - convert everything to lower case and handle variations
df_clean <- df_renamed %>%
  mutate(across(
    c(
      test_set_used,
      statistical_tests,
      other_analysis_methods,
      ablation_studies,
      replication_info
    ),
    ~ case_when(
      tolower(.) %in% c("yes", "y") ~ "Yes",
      tolower(.) %in% c("no", "n") ~ "No",
      is.na(.) ~ "No",
      TRUE ~ "No"
    )
  ))

# Create better titles for each column
title_mapping <- c(
  "test_set_used" = "Test Set or Cross-Validation Used",
  "statistical_tests" = "Statistical Tests Used",
  "other_analysis_methods" = "Alternative Analysis Methods Used",
  "ablation_studies" = "Ablation Studies Performed",
  "replication_info" = "Replication Information Provided"
)
summary_df <- create_summary_df(df_clean, 'test_set_used')
global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(2))(2),
                                  unique(summary_df$group))

ggplot_chart <- create_pie_chart(summary_df, title_mapping['test_set_used'], global_palette = global_palette)

ggsave(
  "fig-mv-eval-test-set-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['test_set_used']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['test_set_used']) %>% kableExtra::kable_styling()
Table 4.35: Table 4.36: Test Set or Cross-Validation Used
group count percentage label column
No 1 1.4 1 (1.4%) test_set_used
Yes 72 98.6 72 (98.6%) test_set_used
summary_df <- create_summary_df(df_clean, 'statistical_tests')
global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(2))(2),
                                  unique(summary_df$group))
ggplot_chart <- create_pie_chart(summary_df, title_mapping['statistical_tests'], global_palette = global_palette)
  
ggsave("fig-mv-eval-stat-test-pie.pdf", plot = ggplot_chart, width = 14, height = 8)

GG_PLOTS_REGISTER[['statistical_tests']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['statistical_tests']) %>% kableExtra::kable_styling()
Table 4.37: Table 4.38: Statistical Tests Used
group count percentage label column
No 56 76.7 56 (76.7%) statistical_tests
Yes 17 23.3 17 (23.3%) statistical_tests
summary_df <- create_summary_df(df_clean, 'other_analysis_methods')
global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(2))(2),
                                  unique(summary_df$group))
ggplot_chart <- create_pie_chart(summary_df, title_mapping['other_analysis_methods'], global_palette = global_palette)

ggsave(
  "fig-mv-eval-other-test-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['other_analysis_methods']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['other_analysis_methods']) %>% kableExtra::kable_styling()
Table 4.39: Table 4.40: Alternative Analysis Methods Used
group count percentage label column
No 73 100 73 (100%) other_analysis_methods
summary_df <- create_summary_df(df_clean, 'ablation_studies')
global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(2))(2),
                                  unique(summary_df$group))
ggplot_chart <- create_pie_chart(summary_df, title_mapping['ablation_studies'], global_palette = global_palette)

ggsave(
  "fig-mv-eval-ablation-study-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['ablation_studies']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['ablation_studies']) %>% kableExtra::kable_styling()
Table 4.41: Table 4.42: Ablation Studies Performed
group count percentage label column
No 14 19.2 14 (19.2%) ablation_studies
Yes 59 80.8 59 (80.8%) ablation_studies
summary_df <- create_summary_df(df_clean, 'replication_info')
global_palette <- stats::setNames(grDevices::colorRampPalette(ggsci::pal_jco("default")(2))(2),
                                  unique(summary_df$group))
ggplot_chart <- create_pie_chart(summary_df, title_mapping['replication_info'], global_palette = global_palette)

ggsave(
  "fig-mv-eval-replication-pie.pdf",
  plot = ggplot_chart,
  width = 14,
  height = 8
)

GG_PLOTS_REGISTER[['replication_info']] <- ggplot_chart
ggplot_chart

knitr::kable(summary_df, caption = title_mapping['replication_info']) %>% kableExtra::kable_styling()
Table 4.43: Table 4.44: Replication Information Provided
group count percentage label column
No 61 83.6 61 (83.6%) replication_info
Yes 12 16.4 12 (16.4%) replication_info

4.3.1 All plots

# Pie charts of studies evaluation & replication
wrapped_title <- stringr::str_wrap(
  "",
  width = 60
)

title_grob <- grid::textGrob(wrapped_title,
                             gp = grid::gpar(fontsize = 14, fontface = "plain"),
                             just = "center")

layout <- rbind(
  c(1, NA, 2),  # Top row: plot 1, empty, plot 2
  c(NA, 3, NA), # Middle row: empty, plot 3, empty
  c(4, NA, 5)   # Bottom row: plot 4, empty, plot 5
)

combined_plot <- gridExtra::grid.arrange(
  GG_PLOTS_REGISTER[['test_set_used']],      # position 1: top-left
  GG_PLOTS_REGISTER[['statistical_tests']],  # position 2: top-right
  GG_PLOTS_REGISTER[['other_analysis_methods']], # position 3: middle-center
  GG_PLOTS_REGISTER[['ablation_studies']],   # position 4: bottom-left
  GG_PLOTS_REGISTER[['replication_info']],   # position 5: bottom-right
  layout_matrix = layout
)

combined_plot <- gridExtra::grid.arrange(
  title_grob,
  combined_plot,
  ncol = 1,
  heights = c(0.5, 10)  # Title takes less space than the plots
)

ggsave(
  "fig-mv-eval-test-set-tests-abl-rep-pie.pdf",
  plot = combined_plot,
  width = 14,
  height = 8
)
plot(combined_plot)

5 References

# Helper function to format authors
format_authors <- function(authors_field) {
  # Handle different input types
  if (is.null(authors_field) || length(authors_field) == 0) return("")
  
  # If it's a list, extract the first element
  if (is.list(authors_field)) {
    if (length(authors_field[[1]]) == 0) return("")
    authors_str <- authors_field[[1]]
  } else {
    authors_str <- authors_field
  }
  
  # Handle vectors - take first non-NA element
  if (length(authors_str) > 1) {
    authors_str <- authors_str[1]
  }
  
  # Check if it's NA or empty - handle single element only
  if (length(authors_str) == 0 || is.na(authors_str[1]) || authors_str[1] == "") return("")
  
  # Convert to character if needed
  authors_str <- as.character(authors_str[1])
  
  # Split authors and format as "First Initial. Last Name"
  authors <- strsplit(authors_str, " and ")[[1]]
  
  formatted_authors <- sapply(authors, function(author) {
    author <- trimws(author)  # Clean whitespace
    
    # Handle "Last, First" format
    if (grepl(",", author)) {
      parts <- strsplit(author, ",")[[1]]
      last_name <- trimws(parts[1])
      first_name <- trimws(parts[2])
      
      # Get first initial(s) - handle middle names too
      name_parts <- strsplit(first_name, " ")[[1]]
      initials <- sapply(name_parts, function(name) {
        if (nchar(trimws(name)) > 0) {
          paste0(substr(trimws(name), 1, 1), ".")
        } else {
          ""
        }
      })
      initials <- paste(initials[initials != ""], collapse = " ")
      
      return(paste0(initials, " ", last_name))
    } else {
      # Handle "First Last" format
      parts <- strsplit(trimws(author), " ")[[1]]
      if (length(parts) >= 2) {
        # First name initial(s)
        first_parts <- parts[1:(length(parts)-1)]
        initials <- sapply(first_parts, function(name) {
          if (nchar(trimws(name)) > 0) {
            paste0(substr(trimws(name), 1, 1), ".")
          } else {
            ""
          }
        })
        initials <- paste(initials[initials != ""], collapse = " ")
        
        # Last name
        last_name <- parts[length(parts)]
        return(paste0(initials, " ", last_name))
      }
      return(author)  # Return as-is if can't parse
    }
  })
  
  return(paste(formatted_authors, collapse = ", "))
}

safe_extract <- function(field) {
  if (is.null(field)) return("")
  if (is.list(field)) {
    if (length(field) == 0 || length(field[[1]]) == 0) return("")
    result <- field[[1]][1]
  } else {
    if (length(field) == 0) return("")
    result <- field[1]
  }
  
  if (is.na(result)) return("")
  return(as.character(result))
}

# Format journal article
format_article <- function(authors, title, year, entry, doi) {
  journal <- safe_extract(entry$JOURNAL)
  volume <- safe_extract(entry$VOLUME)
  pages <- safe_extract(entry$PAGES)
  number <- safe_extract(entry$NUMBER)
  
  # Check if it's an article number format (e.g., e00205)
  is_article_number <- pages != "" && grepl("^e[0-9]+", pages)
  
  if (is_article_number) {
    # Format with article number
    ref <- paste0(authors, ", ", year, ". ", title, ". ", journal, ". ", volume, ", ", pages, ".")
  } else {
    # Standard format
    volume_info <- if (volume != "") paste0(" ", volume) else ""
    page_info <- if (pages != "") paste0(" ", gsub("-", " – ", pages)) else ""
    ref <- paste0(authors, ", ", title, ", ", journal, volume_info, " (", year, ")", page_info, ".")
  }
  
  if (doi != "") ref <- paste0(ref, " ", doi, ".")
  return(ref)
}

# Format book
format_book <- function(authors, title, year, entry) {
  publisher <- safe_extract(entry$PUBLISHER)
  address <- safe_extract(entry$ADDRESS)
  edition <- safe_extract(entry$EDITION)
  
  edition_text <- if (edition != "") paste0(edition, " ed., ") else ""
  location <- if (address != "") paste0(", ", address) else ""
  
  ref <- paste0(authors, ", ", title, ", ", edition_text, publisher, location, ", ", year, ".")
  return(ref)
}

# Format book chapter
format_chapter <- function(authors, title, year, entry) {
  booktitle <- safe_extract(entry$BOOKTITLE)
  editor <- format_authors(entry$EDITOR)
  publisher <- safe_extract(entry$PUBLISHER)
  address <- safe_extract(entry$ADDRESS)
  pages <- safe_extract(entry$PAGES)
  
  editor_text <- if (editor != "") paste0("in: ", editor, " (Eds.), ") else ""
  location <- if (address != "") paste0(", ", address) else ""
  page_info <- if (pages != "") paste0(", pp. ", gsub("-", " - ", pages)) else ""
  
  ref <- paste0(authors, ", ", title, ", ", editor_text, booktitle, ", ", 
                publisher, location, ", ", year, page_info, ".")
  return(ref)
}

# Format miscellaneous (websites, datasets)
format_misc <- function(authors, title, year, entry, url, doi) {
  howpublished <- safe_extract(entry$HOWPUBLISHED)
  note <- safe_extract(entry$NOTE)
  
  # Detect if it's a website or dataset
  is_dataset <- grepl("dataset|data", paste(title, howpublished, note), ignore.case = TRUE)
  
  if (is_dataset) {
    ref <- paste0(authors, ", ", title, " [dataset], ", howpublished, ", ", year, ".")
    if (doi != "") ref <- paste0(ref, " ", doi, ".")
  } else {
    # Website format
    access_info <- if (note != "") paste0(" (", note, ")") else ""
    ref <- paste0(authors, ", ", title, ". ", url, ", ", year, access_info, ".")
  }
  
  return(ref)
}

# Format software
format_software <- function(authors, title, year, entry, doi) {
  howpublished <- safe_extract(entry$HOWPUBLISHED)
  version <- safe_extract(entry$VERSION)
  note <- safe_extract(entry$NOTE)
  
  version_text <- if (version != "") paste0(" ", version) else ""
  date_info <- if (note != "") paste0(", ", note) else ""
  
  ref <- paste0(authors, ", ", title, version_text, " [software], ", 
                howpublished, date_info, ", ", year, ".")
  if (doi != "") ref <- paste0(ref, " ", doi, ".")
  return(ref)
}


# Default format
format_default <- function(authors, title, year, entry, doi, url) {
  ref <- paste0(authors, ", ", title, " (", year, ").")
  if (url != "") ref <- paste0(ref, " ", url, ".")
  if (doi != "") ref <- paste0(ref, " ", doi, ".")
  return(ref)
}

format_proceedings <- function(authors, title, year, entry, doi) {
  booktitle <- safe_extract(entry$BOOKTITLE)
  pages <- safe_extract(entry$PAGES)
  publisher <- safe_extract(entry$PUBLISHER)
  address <- safe_extract(entry$ADDRESS)
  
  location <- if (address != "") paste0(", ", address) else ""
  page_info <- if (pages != "") paste0(", pp. ", gsub("-", " – ", pages)) else ""
  publisher_info <- if (publisher != "") paste0(", ", publisher) else ""
  
  ref <- paste0(authors, ", ", title, ", in: ", booktitle, publisher_info, 
                location, ", ", year, page_info, ".")
  
  if (doi != "") ref <- paste0(ref, " ", doi, ".")
  return(ref)
}

format_bibliography <- function(bib_df) {
  if (is.null(bib_df) || nrow(bib_df) == 0) {
    return("No bibliography entries found.")
  }
  
  formatted_refs <- character(nrow(bib_df))
  
  for (i in 1:nrow(bib_df)) {
    entry <- bib_df[i, ]
    ref_type <- tolower(safe_extract(entry$CATEGORY))
    
    # Extract common fields safely
    bibtexkey <- entry$BIBTEXKEY
    authors <- format_authors(entry$AUTHOR)
    title <- safe_extract(entry$TITLE)
    year <- safe_extract(entry$YEAR)
    doi <- safe_extract(entry$DOI)
    if (doi != "") doi <- paste0("https://doi.org/", doi)
    url <- safe_extract(entry$URL)
    
    # Format based on entry type
    formatted_ref <- tryCatch({
      switch(ref_type,
        "article" = format_article(authors, title, year, entry, doi),
        "book" = format_book(authors, title, year, entry),
        "incollection" = ,
        "inbook" = format_chapter(authors, title, year, entry),
        "inproceedings" = format_proceedings(authors, title, year, entry, doi),
        "misc" = format_misc(authors, title, year, entry, url, doi),
        "techreport" = ,
        "manual" = format_software(authors, title, year, entry, doi),
        # Default format
        format_default(authors, title, year, entry, doi, url)
      )
    }, error = function(e) {
      # Fallback format if there's an error
      paste0(authors, ", ", title, " (", year, ").")
    })
    
    formatted_refs[i] <- paste0("[", i, "] ", "[", bibtexkey, "] ", formatted_ref)
  }
  
  return(paste(formatted_refs, collapse = "\n\n"))
}
# Load and format bibliography
bibligraphy <- load_bibliography('./bibtex-information-fusion-document-classification.bib')
formatted_bibliography <- format_bibliography(bibligraphy)
cat(formatted_bibliography[1])
## [1] [wangq2022] Q. Wang, Cross-domain structure preserving projection for heterogeneous domain adaptation, Pattern Recognition 123 (2022) 108362. https://doi.org/10.1016/j.patcog.2021.108362.
## 
## [2] [zhao2023] F. Zhao, Topic identification of text-based expert stock comments using multi-level information fusion, Expert Systems 40 (2020). https://doi.org/10.1111/exsy.12641.
## 
## [3] [cgoncalves2022] C. A. Gon<U+FFFD>alves, A Novel Multi-View Ensemble Learning Architecture to Improve the Structured Text Classification, Information 13 (2022) 283. https://doi.org/10.3390/info13060283.
## 
## [4] [reil2023] L. Rei, Multimodal metadata assignment for cultural heritage artifacts, Multimedia Systems 29 (2022) 847 –  – 869. https://doi.org/10.1007/s00530-022-01025-2.
## 
## [5] [debreuij2020] J. A. d. Bruijn, Improving the classification of flood tweets with contextual hydrological information in a multimodal neural network, Computers &amp; Geosciences 140 (2020) 104485. https://doi.org/10.1016/j.cageo.2020.104485.
## 
## [6] [liuw2021] W. Liu, Research on Multi-label Text Classification Method Based on tALBERT-CNN, International Journal of Computational Intelligence Systems 14 (2021) 201. https://doi.org/10.1007/s44196-021-00055-4.
## 
## [7] [guod2023] D. Guo, A Comparative Study of Speaker Role Identification in Air Traffic Communication Using Deep Learning Approaches, ACM Transactions on Asian and Low-Resource Language Information Processing 22 (2023) 1 –  – 17. https://doi.org/10.1145/3572792.
## 
## [8] [chatziagapia2022] A. Chatziagapi, Audio and ASR-based Filled Pause Detection, in: 2022 10th International Conference on Affective Computing and Intelligent Interaction (ACII), IEEE, 2022, pp. 1 –  – 7. https://doi.org/10.1109/acii55700.2022.9953889.
## 
## [9] [andriyanovn2022] N. A. Andriyanov, Combining Text and Image Analysis Methods for Solving Multimodal Classification Problems, Pattern Recognition and Image Analysis 32 (2022) 489 –  – 494. https://doi.org/10.1134/s1054661822030026.
## 
## [10] [jiangs2024] S. Jiang, Deep Learning for Technical Document Classification, IEEE Transactions on Engineering Management 71 (2024) 1163 –  – 1179. https://doi.org/10.1109/tem.2022.3152216.
## 
## [11] [luox2022] X. Luo, Effective short text classification via the fusion of hybrid features for IoT social data, Digital Communications and Networks 8 (2022) 942 –  – 954. https://doi.org/10.1016/j.dcan.2022.09.015.
## 
## [12] [kanchid2022] S. Kanchi, EmmDocClassifier: Efficient Multimodal Document Image Classifier for Scarce Data, Applied Sciences 12 (2022) 1457. https://doi.org/10.3390/app12031457.
## 
## [13] [adwaithd2022] D. Adwaith, Enhancing multimodal disaster tweet classification using state-of-the-art deep learning networks, Multimedia Tools and Applications 81 (2022) 18483 –  – 18501. https://doi.org/10.1007/s11042-022-12217-3.
## 
## [14] [liuj2022] J. Liu, Identifying Adverse Drug Reaction-Related Text from Social Media: A Multi-View Active Learning Approach with Various Document Representations, Information 13 (2022) 189. https://doi.org/10.3390/info13040189.
## 
## [15] [sangy2022] Y. Sang, MBTI Personality Prediction for Fictional Characters Using Movie Scripts, in: Findings of the Association for Computational Linguistics: EMNLP 2022, Association for Computational Linguistics (ACL), 2022, pp. 6715 –  – 6724. https://doi.org/10.18653/v1/2022.findings-emnlp.500.
## 
## [16] [paraskevopoulos2022] G. Paraskevopoulos, Multimodal Classification of Safety-Report Observations, Applied Sciences 12 (2022) 5781. https://doi.org/10.3390/app12125781.
## 
## [17] [sapeao2022] O. Sapena, Multimodal Classification of Teaching Activities from University Lecture Recordings, Applied Sciences 12 (2022) 4785. https://doi.org/10.3390/app12094785.
## 
## [18] [arlqaraleshs2024] S. Alqaraleh, Multimodal Classifier for Disaster Response, Advanced Engineering, Technology and Applications, Springer Nature Switzerland, 2023, pp. 1 -  - 13.
## 
## [19] [karisanip2022] P. Karisani, Multi-View Active Learning for Short Text Classification in User-Generated Data, in: Findings of the Association for Computational Linguistics: EMNLP 2022, Association for Computational Linguistics (ACL), 2022, pp. 6441 –  – 6453. https://doi.org/10.18653/v1/2022.findings-emnlp.481.
## 
## [20] [yuet2022] T. Yue, PaperNet: A Dataset and Benchmark for Fine-Grained Paper Classification, Applied Sciences 12 (2022) 4554. https://doi.org/10.3390/app12094554.
## 
## [21] [guq2022] Q. Gu, QiNiAn at SemEval-2022 Task 5: Multi-Modal Misogyny Detection and Classification, in: Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022), Association for Computational Linguistics (ACL), 2022, pp. 736 –  – 741. https://doi.org/10.18653/v1/2022.semeval-1.102.
## 
## [22] [dongpin2022] L. Dongping, Research on Deep Learning Model of Multimodal Heterogeneous Data Based on LSTM, IAENG International Journal of Computer Science 49 (2022).
## 
## [23] [chenl2022] L. Chen, Utilizing Cross-Modal Contrastive Learning to Improve Item Categorization BERT Model, in: Proceedings of The Fifth Workshop on e-Commerce and NLP (ECNLP 5), Association for Computational Linguistics (ACL), 2022, pp. 217 –  – 223. https://doi.org/10.18653/v1/2022.ecnlp-1.25.
## 
## [24] [ortizperez2023] D. Ortiz-Perez, A Deep Learning-Based Multimodal Architecture to predict Signs of Dementia, Neurocomputing 548 (2023) 126413. https://doi.org/10.1016/j.neucom.2023.126413.
## 
## [25] [chos2023] S. Cho, A Framework for Understanding Unstructured Financial Documents Using RPA and Multimodal Approach, Electronics 12 (2023) 939. https://doi.org/10.3390/electronics12040939.
## 
## [26] [fujinumay2023] Y. Fujinuma, A Multi-Modal Multilingual Benchmark for Document Image Classification, in: Findings of the Association for Computational Linguistics: EMNLP 2023, Association for Computing Machinery (ACM), 2023, pp. 14361 –  – 14376. https://doi.org/10.18653/v1/2023.findings-emnlp.958.
## 
## [27] [shah2023] H. Shah, Building a Multimodal Classifier of Email Behavior: Towards a Social Network Understanding of Organizational Communication, Information 14 (2023) 661. https://doi.org/10.3390/info14120661.
## 
## [28] [rasheeda2023] A. Rasheed, Cover-based multiple book genre recognition using an improved multimodal network, International Journal on Document Analysis and Recognition (IJDAR) 26 (2022) 65 –  – 88. https://doi.org/10.1007/s10032-022-00413-8.
## 
## [29] [liut2024] T. Liu, Cross-modal Multiple Granularity Interactive Fusion Network for Long Document Classification, ACM Transactions on Knowledge Discovery from Data 18 (2024) 1 –  – 24. https://doi.org/10.1145/3631711.
## 
## [30] [samya2023] A. E. Samy, Data-Driven Self-Supervised Graph Representation Learning, ECAI 2023, IOS Press, 2023, pp. 629 - 636.
## 
## [31] [jarrahia2023] A. Jarrahi, Evaluating the effectiveness of publishers features in fake news detection on social media, Multimedia Tools and Applications 82 (2022) 2913 –  – 2939. https://doi.org/10.1007/s11042-022-12668-8.
## 
## [32] [liangz2023] Z. Liang, Fake News Detection Based on Multimodal Inputs, Computers, Materials &amp; Continua 75 (2023) 4519 –  – 4534. https://doi.org/10.32604/cmc.2023.037035.
## 
## [33] [kozienkop2023] P. Kazienko, Human-centered neural reasoning for subjective content processing: Hate speech, emotions, and humor, Information Fusion 94 (2023) 43 –  – 65. https://doi.org/10.1016/j.inffus.2023.01.010.
## 
## [34] [graffm2023] M. Graff, Ingeotec at DA-VINCIS: Bag-of-Words Classifiers, in: , CEUR-WS, 2023.
## 
## [35] [varmanp2023] P. K. Verma, MCred: multi-modal message credibility for fake news detection using BERT and CNN, Journal of Ambient Intelligence and Humanized Computing 14 (2022) 10617 –  – 10629. https://doi.org/10.1007/s12652-022-04338-2.
## 
## [36] [tianl2023] L. Tian, MetaTroll: Few-shot Detection of State-Sponsored Trolls with Transformer Adapters, in: Proceedings of the ACM Web Conference 2023, Association for Computing Machinery (ACM), 2023, pp. 1743 – 1753. https://doi.org/10.1145/3543507.3583417.
## 
## [37] [kenny2023] Kenny, MULTIMODAL APPROACH FOR EMOTION RECOGNITION USING FEATURE FUSION, ICIC Express Letters 17 (2023) 181 – 189. https://doi.org/10.24507/icicel.17.02.181.
## 
## [38] [linckere2023] E. Lincker, Noisy and Unbalanced Multimodal Document Classification: Textbook Exercises as a Use Case, in: 20th International Conference on Content-based Multimedia Indexing, Association for Computing Machinery (ACM), 2023, pp. 71 –  – 78. https://doi.org/10.1145/3617233.3617239.
## 
## [39] [jarquin2023] H. Jarqu<U+FFFD>n-V<U+FFFD>squez, Overview of DA-VINCIS at IberLEF 2023: Detection of Aggressive and Violent Incidents from Social Media in Spanish, Procesamiento del Lenguaje Natural (2023) 351360. https://doi.org/10.26342/2023-71-27.
## 
## [40] [luzdearau2023] P. H. Luz de Araujo, Sequence-aware multimodal page classification of Brazilian legal documents, International Journal on Document Analysis and Recognition (IJDAR) 26 (2022) 33 –  – 49. https://doi.org/10.1007/s10032-022-00406-7.
## 
## [41] [chenz2023] Z. Chen, Towards Unifying Medical Vision-and-Language Pre-training via Soft Prompts, in: 2023 IEEE/CVF International Conference on Computer Vision (ICCV), IEEE, 2023, pp. 23346 –  – 23356. https://doi.org/10.1109/iccv51070.2023.02139.
## 
## [42] [zouh2023] H. Zou, UniS-MMC: Multimodal Classification via Unimodality-supervised Multimodal Contrastive Learning, in: Findings of the Association for Computational Linguistics: ACL 2023, Association for Computational Linguistics (ACL), 2023, pp. 659 – 672. https://doi.org/10.18653/v1/2023.findings-acl.41.
## 
## [43] [bakkalis2023] S. Bakkali, VLCDoC: Vision-Language contrastive pre-training model for cross-Modal document classification, Pattern Recognition 139 (2023) 109419. https://doi.org/10.1016/j.patcog.2023.109419.
## 
## [44] [wajdm2024] M. A. Wajid, A deep learning approach for image and text classification using neutrosophy, International Journal of Information Technology 16 (2023) 853 –  – 859. https://doi.org/10.1007/s41870-023-01529-8.
## 
## [45] [fengz2024] Z. Feng, Adaptive micro- and macro-knowledge incorporation for hierarchical text classification, Expert Systems with Applications 248 (2024) 123374. https://doi.org/10.1016/j.eswa.2024.123374.
## 
## [46] [xuy2024] Y. Xu, An effective multi-modal adaptive contextual feature information fusion method for Chinese long text classification, Artificial Intelligence Review 57 (2024) 1 – 29. https://doi.org/10.1007/s10462-024-10835-x.
## 
## [47] [jiz2024] Z. Ji, ASSL-HGAT: Active semi-supervised learning empowered heterogeneous graph attention network, Knowledge-Based Systems 290 (2024) 111567. https://doi.org/10.1016/j.knosys.2024.111567.
## 
## [48] [ghorbanali2024] A. Ghorbanali, Capsule network-based deep ensemble transfer learning for multimodal sentiment analysis, Expert Systems with Applications 239 (2024) 122454. https://doi.org/10.1016/j.eswa.2023.122454.
## 
## [49] [yushili2024] Y. Li, Compact bilinear pooling and multi-loss network for social media multimodal classification, Signal, Image and Video Processing 18 (2024) 8403 –  – 8412. https://doi.org/10.1007/s11760-024-03482-w.
## 
## [50] [zhangqi2024] Z. Jiang, Deep Incomplete Multi-View Learning Network with Insufficient Label Information, Proceedings of the AAAI Conference on Artificial Intelligence 38 (2024) 12919 –  – 12927. https://doi.org/10.1609/aaai.v38i11.29189.
## 
## [51] [xianfang2024] X. Song, Evolutionary computation for feature selection in classification: A comprehensive survey of solutions, applications and challenges, Swarm and Evolutionary Computation 90 (2024) 101661. https://doi.org/10.1016/j.swevo.2024.101661.
## 
## [52] [zhangy2024] Y. Zhang, Image Information Prompt: Tips for Learning Large Language Models, Intelligent Computing Technology and Automation, IOS Press, 2024, pp. undefined - undefined.
## 
## [53] [liub2024] B. Liu, MinJoT: Multimodal infusion Joint Training for noise learning in text and multimodal classification problems, Information Fusion 102 (2024) 102071. https://doi.org/10.1016/j.inffus.2023.102071.
## 
## [54] [tengfeil2024] T. Liu, Multi-modal long document classification based on Hierarchical Prompt and Multi-modal Transformer, Neural Networks 176 (2024) 106322. https://doi.org/10.1016/j.neunet.2024.106322.
## 
## [55] [ronghaop2024] R. Pan, Spanish MEACorpus 2023: A multimodal speechtext corpus for emotion analysis in Spanish from natural environments, Computer Standards &amp; Interfaces 90 (2024) 103856. https://doi.org/10.1016/j.csi.2024.103856.
## 
## [56] [dasigiv2001] V. Dasigi, Information fusion for text classification an experimental comparison, Pattern Recognition 34 (2001) 2413 –  – 2425. https://doi.org/10.1016/s0031-3203(00)00171-0.
## 
## [57] [matsubara2005] E. T. Matsubara, Multi-view semi-supervised learning: An approach to obtain different views from text datasets, in: , IOS Press BV, 2005, pp. 97 – 104.
## 
## [58] [zhangb2008] B. Zhang, Co-EM Support Vector Machine Based Text Classification from Positive and Unlabeled Examples, in: 2008 First International Conference on Intelligent Networks and Intelligent Systems, IEEE, 2008, pp. 745 –  – 748. https://doi.org/10.1109/icinis.2008.29.
## 
## [59] [suns2008] S. Sun, Semantic Features for Multi-view Semi-supervised and Active Learning of Text Classification, in: 2008 IEEE International Conference on Data Mining Workshops, IEEE, 2008, pp. 731 –  – 735. https://doi.org/10.1109/icdmw.2008.13.
## 
## [60] [gup2009] P. Gu, A multi-view approach to semi-supervised document classification with incremental Naive Bayes, Computers &amp; Mathematics with Applications 57 (2009) 1030 –  – 1036. https://doi.org/10.1016/j.camwa.2008.10.025.
## 
## [61] [zhangx2009] X. Zhang, Batch Mode Active Learning Based Multi-view Text Classification, in: 2009 Sixth International Conference on Fuzzy Systems and Knowledge Discovery, IEEE, 2009, pp. 472 –  – 476. https://doi.org/10.1109/fskd.2009.495.
## 
## [62] [chenb2009] B. Chen, Document Classification with One-class Multiview Learning, in: 2009 International Conference on Industrial and Information Systems, IEEE, 2009, pp. 289 –  – 292. https://doi.org/10.1109/iis.2009.15.
## 
## [63] [aminim2009] M. Amini, Learning from Multiple Partially Observed Views - an Application to Multilingual Text Categorization, in: , Neural Information Processing Systems, 2009, pp. 28 – 36.
## 
## [64] [chens2009] S. D. Chen, Meta-classifiers for multimodal document classification, in: 2009 IEEE International Workshop on Multimedia Signal Processing, IEEE, 2009, pp. 1 –  – 6. https://doi.org/10.1109/mmsp.2009.5293343.
## 
## [65] [aminim2010] M. Amini, A co-classification approach to learning from<U+FFFD>multilingual corpora, Machine Learning 79 (2009) 105 –  – 121. https://doi.org/10.1007/s10994-009-5151-5.
## 
## [66] [zhangx2010b] X. Zhang, A general decision layer text classification fusion model, in: 2010 2nd International Conference on Education Technology and Computer, IEEE, 2010, pp. V5 – 239 – V5 – 241. https://doi.org/10.1109/icetc.2010.5529774.
## 
## [67] [suns2010] S. Sun, Active learning with extremely sparse labeled examples, Neurocomputing 73 (2010) 2980 –  – 2988. https://doi.org/10.1016/j.neucom.2010.07.007.
## 
## [68] [aminim2010b] M. R. Amini, Combining coregularization and consensus-based self-training for multilingual text categorization, in: Proceedings of the 33rd international ACM SIGIR conference on Research and development in information retrieval, ACM, 2010, pp. 475 –  – 482. https://doi.org/10.1145/1835449.1835529.
## 
## [69] [perezgraciat2010] T. P<U+FFFD>rez-Garc<U+FFFD>a, Harmonic and instrumental information fusion for musical genre classification, in: Proceedings of 3rd international workshop on Machine learning and music, ACM, 2010, pp. 49 –  – 52. https://doi.org/10.1145/1878003.1878020.
## 
## [70] [zhangx2010] X. Zhang, Study on Multi-layer Fusion Classification Model of Multi-media Information, in: 2010 International Conference on Web Information Systems and Mining, IEEE, 2010, pp. 216 –  – 218. https://doi.org/10.1109/wism.2010.126.
## 
## [71] [zhengw2011] W. Zheng, Dimensionality Reduction with Category Information Fusion and Non-negative Matrix Factorization for Text Categorization, Artificial Intelligence and Computational Intelligence, Springer Berlin Heidelberg, 2011, pp. 505 -  - 512.
## 
## [72] [guyo2012] Y. Guo, Cross Language Text Classification via Subspace Co-regularized Multi-view Learning, in: , 2012, pp. 1615 – 1622. https://doi.org/10.48550/arxiv.1206.6481.
## 
## [73] [kovesim2012] M. Kovesi, Fast on-line learning for multilingual categorization, in: Proceedings of the 35th international ACM SIGIR conference on Research and development in information retrieval, ACM, 2012, pp. 1071 –  – 1072. https://doi.org/10.1145/2348283.2348474.
## 
## [74] [yangp2012] P. Yang, Information-theoretic Multi-view Domain Adaptation, in: , 2012, pp. 270 – 274.
## 
## [75] [lig2012] G. Li, Multiview Semi-Supervised Learning with Consensus, IEEE Transactions on Knowledge and Data Engineering 24 (2012) 2040 –  – 2051. https://doi.org/10.1109/tkde.2011.160.
## 
## [76] [zhangb2013] B. Zhang, Classification of big velocity data via cross-domain Canonical Correlation Analysis, in: 2013 IEEE International Conference on Big Data, IEEE, 2013, pp. 493 –  – 498. https://doi.org/10.1109/bigdata.2013.6691612.
## 
## [77] [liy2013] Y. LI, Combination of multiple feature selection methods for text categorization by using combinatorial fusion analysis and rank-score characteristic, International Journal on Artificial Intelligence Tools 22 (2013) 1350001. https://doi.org/10.1142/s0218213013500012.
## 
## [78] [perinaa2013] A. Perina, Documents as multiple overlapping windows into grids of counts, in: , Neural information processing systems foundation, 2013.
## 
## [79] [longg2013] G. Long, Graph Based Feature Augmentation for Short and Sparse Text Classification, Advanced Data Mining and Applications, Springer Berlin Heidelberg, 2013, pp. 456 -  - 467.
## 
## [80] [zhangd2013] D. Zhang, MI2LS: multi-instance learning from multiple informationsources, in: Proceedings of the 19th ACM SIGKDD international conference on Knowledge discovery and data mining, Association for Computing Machinery (ACM), 2013, pp. 149 –  – 157. https://doi.org/10.1145/2487575.2487651.
## 
## [81] [cristanim2014] M. Cristani, A Multimodal Approach to Exploit Similarity in Documents, Modern Advances in Applied Intelligence, Springer International Publishing, 2014, pp. 490 -  - 499.
## 
## [82] [liuj2014] J. Liu, An Embedded Co-AdaBoost based construction of software document relation coupled resource spaces for cyberphysical society, Future Generation Computer Systems 32 (2014) 198 –  – 210. https://doi.org/10.1016/j.future.2012.12.017.
## 
## [83] [yangp2014] P. Yang, Information-Theoretic Multi-view Domain Adaptation: A Theoretical and Empirical Study, Journal of Artificial Intelligence Research 49 (2014) 501 –  – 525. https://doi.org/10.1613/jair.4190.
## 
## [84] [liparas2014] D. Liparas, News Articles Classification Using Random Forests and Weighted Multimodal Features, Multidisciplinary Information Retrieval, Springer International Publishing, 2014, pp. 63 -  - 75.
## 
## [85] [liaox2015] X. L. Liao, A Multi-topic Meta-classification Scheme for Analyzing Lobbying Disclosure Data, in: 2015 IEEE International Conference on Information Reuse and Integration, IEEE, 2015, pp. 349 –  – 356. https://doi.org/10.1109/iri.2015.60.
## 
## [86] [brefeldu2015] U. Brefeld, Multi-view learning with dependent views, in: Proceedings of the 30th Annual ACM Symposium on Applied Computing, Association for Computing Machinery (ACM), 2015, pp. 865 –  – 870. https://doi.org/10.1145/2695664.2695829.
## 
## [87] [fakri2015] A. Fakeri-Tabrizi, Multiview self-learning, Neurocomputing 155 (2015) 117 –  – 127. https://doi.org/10.1016/j.neucom.2014.12.041.
## 
## [88] [iglesias2016] E. L. Iglesias, An HMM-Based Multi-view Co-training Framework for Single-View Text Corpora, Hybrid Artificial Intelligent Systems, Springer International Publishing, 2016, pp. 66 -  - 78.
## 
## [89] [rajendran2016] J. Rajendran, Bridge Correlational Neural Networks for Multilingual Multimodal Representation Learning, in: Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Association for Computing Machinery (ACM), 2016, pp. 171 –  – 181. https://doi.org/10.18653/v1/n16-1021.
## 
## [90] [xux2016] X. Xu, Co-Labeling for Multi-View Weakly Labeled Learning, IEEE Transactions on Pattern Analysis and Machine Intelligence 38 (2016) 1113 –  – 1125. https://doi.org/10.1109/tpami.2015.2476813.
## 
## [91] [sinorar2016] R. A. Sinoara, Semantic role-based representations in text classification, in: 2016 23rd International Conference on Pattern Recognition (ICPR), IEEE, 2016, pp. 2313 –  – 2318. https://doi.org/10.1109/icpr.2016.7899981.
## 
## [92] [xuh2016] H. Xu, Text Classification with Topic-based Word Embedding and Convolutional Neural Networks, in: Proceedings of the 7th ACM International Conference on Bioinformatics, Computational Biology, and Health Informatics, Association for Computing Machinery (ACM), 2016, pp. 88 –  – 97. https://doi.org/10.1145/2975167.2975176.
## 
## [93] [huz2017] Z. Hu, A deep learning approach for predicting the quality of online health expert question-answering services, Journal of Biomedical Informatics 71 (2017) 241 –  – 253. https://doi.org/10.1016/j.jbi.2017.06.012.
## 
## [94] [akhtiamovo2017] O. Akhtiamov, Are You Addressing Me? Multimodal Addressee Detection in Human-Human-Computer Conversations, Speech and Computer, Springer International Publishing, 2017, pp. 152 -  - 161.
## 
## [95] [zhanz2017] Z. Zhan, Document Analysis Based on Multi-view Intact Space Learning with Manifold Regularization, Intelligence Science and Big Data Engineering, Springer International Publishing, 2017, pp. 41 -  - 51.
## 
## [96] [schmittm2017] M. Schmitt, openXBOW - Introducing the Passau Open-Source Crossmodal Bag-of-Words Toolkit, Journal of Machine Learning Research 18 (2017) 1 – 5. https://doi.org/10.48550/arxiv.1605.06778.
## 
## [97] [xuc2017] C. Xu, Multimodal Fusion with Global and Local Features for Text Classification, Neural Information Processing, Springer International Publishing, 2017, pp. 124 -  - 134.
## 
## [98] [pengj2018] J. Peng, Multiview Boosting With Information Propagation for Classification, IEEE Transactions on Neural Networks and Learning Systems 29 (2018) 657 –  – 669. https://doi.org/10.1109/tnnls.2016.2637881.
## 
## [99] [argon2018] M. E. Arag<U+FFFD>n, A Straightforward Multimodal Approach for Author Profiling: Notebook for PAN at CLEF 2018, CEUR Workshop Proceedings 2125 (2018).
## 
## [100] [ferreira2018] C. H. P. Ferreira, Combining Multiple Views from a Distance Based Feature Extraction for Text Classification, in: 2018 IEEE Congress on Evolutionary Computation (CEC), IEEE, 2018, pp. 1 –  – 8. https://doi.org/10.1109/cec.2018.8477772.
## 
## [101] [guptad2018] D. Gupta, Empowering First Responders through Automated Multimodal Content Moderation, in: 2018 IEEE International Conference on Cognitive Computing (ICCC), IEEE, 2018, pp. 1 –  – 8. https://doi.org/10.1109/iccc.2018.00008.
## 
## [102] [akhiamov2018] O. Akhtiamov, Gaze, Prosody and Semantics: Relevance of Various Multimodal Signals to Addressee Detection in Human-Human-Computer Conversations, Speech and Computer, Springer International Publishing, 2018, pp. 1 -  - 10.
## 
## [103] [tellez2018] E. S. Tellez, Gender identification through multi-modal Tweet analysis using MicroTC and Bag of Visual Words: Notebook for PAN at CLEF 2018, in: , CEUR-WS, 2018.
## 
## [104] [matricm2018] M. Martinc, Multilingual gender classification with multi-view deep learning notebook for PAN at CLEF 2018, in: , CEUR-WS, 2018.
## 
## [105] [zhup2018] P. Zhu, Multi-view label embedding, Pattern Recognition 84 (2018) 126 –  – 135. https://doi.org/10.1016/j.patcog.2018.07.009.
## 
## [106] [mmironczuk2020] M. M. Mironczuk, Recognising innovative companies by using a diversified stacked generalisation method for website classification, Applied Intelligence 50 (2019) 42 –  – 60. https://doi.org/10.1007/s10489-019-01509-1.
## 
## [107] [anget2018] T. Ange, Semi-Supervised Multimodal Deep Learning Model for Polarity Detection in Arguments, in: 2018 International Joint Conference on Neural Networks (IJCNN), IEEE, 2018, pp. 1 –  – 8. https://doi.org/10.1109/ijcnn.2018.8489342.
## 
## [108] [akhtiamov2019] O. Akhtiamov, A Comparative Study of Classical and Deep Classifiers for Textual Addressee Detection in Human-Human-Machine Conversations, Speech and Computer, Springer International Publishing, 2019, pp. 20 -  - 30.
## 
## [109] [hoylea2019] A. M. Hoyle, Combining Sentiment Lexica with a Multi-View Variational Autoencoder, in: Proceedings of the 2019 Conference of the North, Association for Computational Linguistics (ACL), 2019, pp. 635 –  – 640. https://doi.org/10.18653/v1/n19-1065.
## 
## [110] [wangh2019] H. Wang, Co-regularized multi-view sparse reconstruction embedding for dimension reduction, Neurocomputing 347 (2019) 191 –  – 199. https://doi.org/10.1016/j.neucom.2019.03.080.
## 
## [111] [chens2019] S. Chen, Deep Learning Method with Attention for Extreme Multi-label Text Classification, PRICAI 2019: Trends in Artificial Intelligence, Springer International Publishing, 2019, pp. 179 -  - 190.
## 
## [112] [mmironczuk2019] M. M. Mironczuk, Empirical evaluation of feature projection algorithms for multi-view text classification, Expert Systems with Applications 130 (2019) 97 –  – 112. https://doi.org/10.1016/j.eswa.2019.04.020.
## 
## [113] [ravikiranm2019] M. Ravikiran, Fusing Deep Quick Response Code Representations Improves Malware Text Classification, in: Proceedings of the ACM Workshop on Crossmodal Learning and Application, Association for Computing Machinery (ACM), 2019, pp. 11 –  – 18. https://doi.org/10.1145/3326459.3329166.
## 
## [114] [jainr2019] R. Jain, Multimodal Document Image Classification, in: 2019 International Conference on Document Analysis and Recognition (ICDAR), IEEE, 2019, pp. 71 –  – 77. https://doi.org/10.1109/icdar.2019.00021.
## 
## [115] [wangh2020] H. Wang, Multi-view reconstructive preserving embedding for dimension reduction, Soft Computing 24 (2019) 7769 –  – 7780. https://doi.org/10.1007/s00500-019-04395-4.
## 
## [116] [hey2019] Y. He, Multi-view transfer learning with privileged learning framework, Neurocomputing 335 (2019) 131 –  – 142. https://doi.org/10.1016/j.neucom.2019.01.019.
## 
## [117] [bhatt2019] G. Bhatt, Representation learning using step-based deep multi-modal autoencoders, Pattern Recognition 95 (2019) 12 –  – 23. https://doi.org/10.1016/j.patcog.2019.05.032.
## 
## [118] [zhu2020] W. Zhu, A Synchronized Word Representation Method With Dual Perceptual Information, IEEE Access 8 (2020) 22335 –  – 22344. https://doi.org/10.1109/access.2020.2969983.
## 
## [119] [carmona2020] M. <U+FFFD>. <U+FFFD>lvarez Carmona, Author Profiling in Social Media with Multimodal Information, Computaci<U+FFFD>n y Sistemas 24 (2020) 1289 – 1304. https://doi.org/10.13053/cys-24-3-3488.
## 
## [120] [doinychko2020] A. Doinychko, Biconditional Generative Adversarial Networks for Multiview Learning with Missing Views, Advances in Information Retrieval, Springer International Publishing, 2020, pp. 807 -  - 820.
## 
## [121] [hessel2020] J. Hessel, Does my multimodal model learn cross-modal interactions? Its harder to tell than you might think!, in: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), Association for Computational Linguistics (ACL), 2020, pp. 861 – 877. https://doi.org/10.18653/v1/2020.emnlp-main.62.
## 
## [122] [braz2020] L. Braz, ImTeNet: Image-Text Classification Network for Abnormality Detection and Automatic Reporting on Musculoskeletal Radiographs, Advances in Bioinformatics and Computational Biology, Springer International Publishing, 2020, pp. 150 -  - 161.
## 
## [123] [dacosta2022] T. A. G. d. Costa, Providing a greater precision of Situational Awareness of urban floods through Multimodal Fusion, Expert Systems with Applications 188 (2022) 115923. https://doi.org/10.1016/j.eswa.2021.115923.
## 
## [124] [garg2021] S. Garg, On-Device Document Classification using multimodal features, in: Proceedings of the 3rd ACM India Joint International Conference on Data Science &amp; Management of Data (8th ACM IKDD CODS &amp; 26th COMAD), ACM, 2021, pp. 203 –  – 207. https://doi.org/10.1145/3430984.3431030.
## 
## [125] [huc2021] C. Hu, One-class Text Classification with Multi-modal Deep Support Vector Data Description, in: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, Association for Computational Linguistics, 2021, pp. 3378 –  – 3390. https://doi.org/10.18653/v1/2021.eacl-main.296.
## 
## [126] [gui2021] M. Gui, Technology Forecasting Using Deep Learning Neural Network: Taking the Case of Robotics, IEEE Access 9 (2021) 53306 –  – 53316. https://doi.org/10.1109/access.2021.3070105.
## 
## [127] [gallo2021] I. Gallo, Visual Word Embedding for Text Classification, Pattern Recognition. ICPR International Workshops and Challenges, Springer International Publishing, 2021, pp. 339 -  - 352.
## 
## [128] [setiawan2021] E. Setiawan, Multiview Sentiment Analysis with Image-Text-Concept Features of Indonesian Social Media Posts, International Journal of Intelligent Engineering and Systems 14 (2021) 521 –  – 535. https://doi.org/10.22266/ijies2021.0430.47.
## 
## [129] [jiax2021] X. Jia, Semi-Supervised Multi-View Deep Discriminant Representation Learning, IEEE Transactions on Pattern Analysis and Machine Intelligence 43 (2021) 2496 –  – 2509. https://doi.org/10.1109/tpami.2020.2973634.
## 
## [130] [sus2021] S. Su, A Dynamic Discriminative Canonical Correlation Analysis via Adaptive Weight Scheme, IEEE Access 9 (2021) 142653 –  – 142663. https://doi.org/10.1109/access.2021.3118023.
## 
## [131] [guelorget2021] P. Gu<U+FFFD>lorget, Active learning to measure opinion and violence in French newspapers, Procedia Computer Science 192 (2021) 202 –  – 211. https://doi.org/10.1016/j.procs.2021.08.021.
## 
## [132] [liang2021] Y. Liang, Fusion of heterogeneous attention mechanisms in multi-view convolutional neural network for text classification, Information Sciences 548 (2021) 295 –  – 312. https://doi.org/10.1016/j.ins.2020.10.021.
## 
## [133] [wang2021] X. Wang, Implicit Emotion Relationship Mining Based on Optimal and Majority Synthesis From Multimodal Data Prediction, IEEE MultiMedia 28 (2021) 96 –  – 105. https://doi.org/10.1109/mmul.2021.3071495.
## 
## [134] [zhang2021] Y. Zhang, Learning sentiment sentence representation with multiview attention model, Information Sciences 571 (2021) 459 –  – 474. https://doi.org/10.1016/j.ins.2021.05.044.
## 
## [135] [zingaro2021] S. P. Zingaro, Multimodal Side- Tuning for Document Classification, in: 2020 25th International Conference on Pattern Recognition (ICPR), IEEE, 2021, pp. 5206 –  – 5213. https://doi.org/10.1109/icpr48806.2021.9413208.
## 
## [136] [max2020] X. Ma, Particle Filter Recurrent Neural Networks, in: , Association for the Advancement of Artificial Intelligence (AAAI), 2020, pp. 5101 –  – 5108. https://doi.org/10.1609/aaai.v34i04.5952.
## 
## [137] [maf2020] F. Ma, Self-paced multi-view co-training, Journal of Machine Learning Research 21 (2020).
## 
## [138] [lij2020] J. Li, TextShield: Robust Text Classification Based on Multimodal Embedding and Neural Machine Translation, in: , 2020.
## 
## [139] [ma2021] C. Ma, On the (In)Effectiveness of Images for Text Classification, in: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, Association for Computational Linguistics, 2021, pp. 42 –  – 48. https://doi.org/10.18653/v1/2021.eacl-main.4.